40 files changed, 18303 insertions, 0 deletions
diff --git a/qa/tasks/cephfs/__init__.py b/qa/tasks/cephfs/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/qa/tasks/cephfs/__init__.py
diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py
new file mode 100644
index 00000000..f901f44b
--- /dev/null
+++ b/qa/tasks/cephfs/cephfs_test_case.py
@@ -0,0 +1,324 @@
+import time
+import json
+import logging
+from unittest import case
+from tasks.ceph_test_case import CephTestCase
+import os
+import re
+
+from tasks.cephfs.fuse_mount import FuseMount
+
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+from teuthology.contextutil import safe_while
+
+
+log = logging.getLogger(__name__)
+
+
+def for_teuthology(f):
+    """
+    Decorator that adds an "is_for_teuthology" attribute to the wrapped function
+    """
+    f.is_for_teuthology = True
+    return f
+
+
+def needs_trimming(f):
+    """
+    Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse
+    this means it needs to be able to run as root, currently)
+    """
+    f.needs_trimming = True
+    return f
+
+
+class CephFSTestCase(CephTestCase):
+    """
+    Test case for Ceph FS, requires caller to populate Filesystem and Mounts,
+    into the fs, mount_a, mount_b class attributes (setting mount_b is optional)
+
+    Handles resetting the cluster under test between tests.
+    """
+
+    # FIXME weird explicit naming
+    mount_a = None
+    mount_b = None
+    recovery_mount = None
+
+    # Declarative test requirements: subclasses should override these to indicate
+    # their special needs.  If not met, tests will be skipped.
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+    REQUIRE_KCLIENT_REMOTE = False
+    REQUIRE_ONE_CLIENT_REMOTE = False
+
+    # Whether to create the default filesystem during setUp
+    REQUIRE_FILESYSTEM = True
+
+    # requires REQUIRE_FILESYSTEM = True
+    REQUIRE_RECOVERY_FILESYSTEM = False
+
+    LOAD_SETTINGS = []
+
+    def setUp(self):
+        super(CephFSTestCase, self).setUp()
+
+        self.config_set('mon', 'mon_allow_pool_delete', True)
+
+        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
+            raise case.SkipTest("Only have {0} MDSs, require {1}".format(
+                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
+            ))
+
+        if len(self.mounts) < self.CLIENTS_REQUIRED:
+            raise case.SkipTest("Only have {0} clients, require {1}".format(
+                len(self.mounts), self.CLIENTS_REQUIRED
+            ))
+
+        if self.REQUIRE_KCLIENT_REMOTE:
+            if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
+                # kclient kill() power cycles nodes, so requires clients to each be on
+                # their own node
+                if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
+                    raise case.SkipTest("kclient clients must be on separate nodes")
+
+        if self.REQUIRE_ONE_CLIENT_REMOTE:
+            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
+                raise case.SkipTest("Require first client to be on separate server from MDSs")
+
+        # Create friendly mount_a, mount_b attrs
+        for i in range(0, self.CLIENTS_REQUIRED):
+            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])
+
+        self.mds_cluster.clear_firewall()
+
+        # Unmount all clients, we are about to blow away the filesystem
+        for mount in self.mounts:
+            if mount.is_mounted():
+                mount.umount_wait(force=True)
+
+        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
+        # the filesystem rather than just doing a rm -rf of files
+        self.mds_cluster.delete_all_filesystems()
+        self.mds_cluster.mds_restart() # to reset any run-time configs, etc.
+        self.fs = None # is now invalid!
+        self.recovery_fs = None
+
+        # In case anything is in the OSD blacklist list, clear it out.  This is to avoid
+        # the OSD map changing in the background (due to blacklist expiry) while tests run.
+        try:
+            self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear")
+        except CommandFailedError:
+            # Fallback for older Ceph cluster
+            blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
+                                  "dump", "--format=json-pretty"))['blacklist']
+            log.info("Removing {0} blacklist entries".format(len(blacklist)))
+            for addr, blacklisted_at in blacklist.items():
+                self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)
+
+        client_mount_ids = [m.client_id for m in self.mounts]
+        # In case the test changes the IDs of clients, stash them so that we can
+        # reset in tearDown
+        self._original_client_ids = client_mount_ids
+        log.info(client_mount_ids)
+
+        # In case there were any extra auth identities around from a previous
+        # test, delete them
+        for entry in self.auth_list():
+            ent_type, ent_id = entry['entity'].split(".")
+            if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin":
+                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])
+
+        if self.REQUIRE_FILESYSTEM:
+            self.fs = self.mds_cluster.newfs(create=True)
+
+            # In case some test messed with auth caps, reset them
+            for client_id in client_mount_ids:
+                self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+                    'auth', 'caps', "client.{0}".format(client_id),
+                    'mds', 'allow',
+                    'mon', 'allow r',
+                    'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))
+
+            # wait for ranks to become active
+            self.fs.wait_for_daemons()
+
+            # Mount the requested number of clients
+            for i in range(0, self.CLIENTS_REQUIRED):
+                self.mounts[i].mount()
+                self.mounts[i].wait_until_mounted()
+
+        if self.REQUIRE_RECOVERY_FILESYSTEM:
+            if not self.REQUIRE_FILESYSTEM:
+                raise case.SkipTest("Recovery filesystem requires a primary filesystem as well")
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
+                                                'enable_multiple', 'true',
+                                                '--yes-i-really-mean-it')
+            self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
+            self.recovery_fs.set_metadata_overlay(True)
+            self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
+            self.recovery_fs.create()
+            self.recovery_fs.getinfo(refresh=True)
+            self.recovery_fs.mds_restart()
+            self.recovery_fs.wait_for_daemons()
+
+        # Load an config settings of interest
+        for setting in self.LOAD_SETTINGS:
+            setattr(self, setting, float(self.fs.mds_asok(
+                ['config', 'get', setting], list(self.mds_cluster.mds_ids)[0]
+            )[setting]))
+
+        self.configs_set = set()
+
+    def tearDown(self):
+        self.mds_cluster.clear_firewall()
+        for m in self.mounts:
+            m.teardown()
+
+        for i, m in enumerate(self.mounts):
+            m.client_id = self._original_client_ids[i]
+
+        for subsys, key in self.configs_set:
+            self.mds_cluster.clear_ceph_conf(subsys, key)
+
+        return super(CephFSTestCase, self).tearDown()
+
+    def set_conf(self, subsys, key, value):
+        self.configs_set.add((subsys, key))
+        self.mds_cluster.set_ceph_conf(subsys, key, value)
+
+    def auth_list(self):
+        """
+        Convenience wrapper on "ceph auth ls"
+        """
+        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
+            "auth", "ls", "--format=json-pretty"
+        ))['auth_dump']
+
+    def assert_session_count(self, expected, ls_data=None, mds_id=None):
+        if ls_data is None:
+            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)
+
+        alive_count = len([s for s in ls_data if s['state'] != 'killing'])
+
+        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
+            expected, alive_count
+        ))
+
+    def assert_session_state(self, client_id,  expected_state):
+        self.assertEqual(
+            self._session_by_id(
+                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
+            expected_state)
+
+    def get_session_data(self, client_id):
+        return self._session_by_id(client_id)
+
+    def _session_list(self):
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
+        return ls_data
+
+    def get_session(self, client_id, session_ls=None):
+        if session_ls is None:
+            session_ls = self.fs.mds_asok(['session', 'ls'])
+
+        return self._session_by_id(session_ls)[client_id]
+
+    def _session_by_id(self, session_ls):
+        return dict([(s['id'], s) for s in session_ls])
+
+    def perf_dump(self, rank=0, status=None):
+        return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)
+
+    def wait_until_evicted(self, client_id, timeout=30):
+        def is_client_evicted():
+            ls = self._session_list()
+            for s in ls:
+                if s['id'] == client_id:
+                    return False
+            return True
+        self.wait_until_true(is_client_evicted, timeout)
+
+    def wait_for_daemon_start(self, daemon_ids=None):
+        """
+        Wait until all the daemons appear in the FSMap, either assigned
+        MDS ranks or in the list of standbys
+        """
+        def get_daemon_names():
+            return [info['name'] for info in self.mds_cluster.status().get_all()]
+
+        if daemon_ids is None:
+            daemon_ids = self.mds_cluster.mds_ids
+
+        try:
+            self.wait_until_true(
+                lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
+                timeout=30
+            )
+        except RuntimeError:
+            log.warning("Timeout waiting for daemons {0}, while we have {1}".format(
+                daemon_ids, get_daemon_names()
+            ))
+            raise
+
+    def delete_mds_coredump(self, daemon_id):
+        # delete coredump file, otherwise teuthology.internal.coredump will
+        # catch it later and treat it as a failure.
+        core_pattern = self.mds_cluster.mds_daemons[daemon_id].remote.sh(
+            "sudo sysctl -n kernel.core_pattern")
+        core_dir = os.path.dirname(core_pattern.strip())
+        if core_dir:  # Non-default core_pattern with a directory in it
+            # We have seen a core_pattern that looks like it's from teuthology's coredump
+            # task, so proceed to clear out the core file
+            if core_dir[0] == '|':
+                log.info("Piped core dumps to program {0}, skip cleaning".format(core_dir[1:]))
+                return;
+
+            log.info("Clearing core from directory: {0}".format(core_dir))
+
+            # Verify that we see the expected single coredump
+            ls_output = self.mds_cluster.mds_daemons[daemon_id].remote.sh([
+                "cd", core_dir, run.Raw('&&'),
+                "sudo", "ls", run.Raw('|'), "sudo", "xargs", "file"
+            ])
+            cores = [l.partition(":")[0]
+                     for l in ls_output.strip().split("\n")
+                     if re.match(r'.*ceph-mds.* -i +{0}'.format(daemon_id), l)]
+
+            log.info("Enumerated cores: {0}".format(cores))
+            self.assertEqual(len(cores), 1)
+
+            log.info("Found core file {0}, deleting it".format(cores[0]))
+
+            self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+                "cd", core_dir, run.Raw('&&'), "sudo", "rm", "-f", cores[0]
+            ])
+        else:
+            log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
+
+    def _wait_subtrees(self, status, rank, test):
+        timeout = 30
+        pause = 2
+        test = sorted(test)
+        for i in range(timeout // pause):
+            subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
+            subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees)
+            filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
+            log.info("%s =?= %s", filtered, test)
+            if filtered == test:
+                # Confirm export_pin in output is correct:
+                for s in subtrees:
+                    self.assertTrue(s['export_pin'] == s['auth_first'])
+                return subtrees
+            time.sleep(pause)
+        raise RuntimeError("rank {0} failed to reach desired subtree state", rank)
+
+    def _wait_until_scrub_complete(self, path="/", recursive=True):
+        out_json = self.fs.rank_tell(["scrub", "start", path] + ["recursive"] if recursive else [])
+        with safe_while(sleep=10, tries=10) as proceed:
+            while proceed():
+                out_json = self.fs.rank_tell(["scrub", "status"])
+                if out_json['status'] == "no active scrubs running":
+                    break;
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
new file mode 100644
index 00000000..c5531f94
--- /dev/null
+++ b/qa/tasks/cephfs/filesystem.py
@@ -0,0 +1,1386 @@
+
+import json
+import logging
+from gevent import Greenlet
+import os
+import time
+import datetime
+import re
+import errno
+import random
+import traceback
+
+from io import BytesIO
+from io import StringIO
+
+from teuthology.exceptions import CommandFailedError
+from teuthology import misc
+from teuthology.nuke import clear_firewall
+from teuthology.parallel import parallel
+from tasks.ceph_manager import write_conf
+from tasks import ceph_manager
+
+
+log = logging.getLogger(__name__)
+
+
+DAEMON_WAIT_TIMEOUT = 120
+ROOT_INO = 1
+
+class FileLayout(object):
+    def __init__(self, pool=None, pool_namespace=None, stripe_unit=None, stripe_count=None, object_size=None):
+        self.pool = pool
+        self.pool_namespace = pool_namespace
+        self.stripe_unit = stripe_unit
+        self.stripe_count = stripe_count
+        self.object_size = object_size
+
+    @classmethod
+    def load_from_ceph(layout_str):
+        # TODO
+        pass
+
+    def items(self):
+        if self.pool is not None:
+            yield ("pool", self.pool)
+        if self.pool_namespace:
+            yield ("pool_namespace", self.pool_namespace)
+        if self.stripe_unit is not None:
+            yield ("stripe_unit", self.stripe_unit)
+        if self.stripe_count is not None:
+            yield ("stripe_count", self.stripe_count)
+        if self.object_size is not None:
+            yield ("object_size", self.stripe_size)
+
+class ObjectNotFound(Exception):
+    def __init__(self, object_name):
+        self._object_name = object_name
+
+    def __str__(self):
+        return "Object not found: '{0}'".format(self._object_name)
+
+class FSStatus(object):
+    """
+    Operations on a snapshot of the FSMap.
+    """
+    def __init__(self, mon_manager):
+        self.mon = mon_manager
+        self.map = json.loads(self.mon.raw_cluster_cmd("fs", "dump", "--format=json"))
+
+    def __str__(self):
+        return json.dumps(self.map, indent = 2, sort_keys = True)
+
+    # Expose the fsmap for manual inspection.
+    def __getitem__(self, key):
+        """
+        Get a field from the fsmap.
+        """
+        return self.map[key]
+
+    def get_filesystems(self):
+        """
+        Iterator for all filesystems.
+        """
+        for fs in self.map['filesystems']:
+            yield fs
+
+    def get_all(self):
+        """
+        Iterator for all the mds_info components in the FSMap.
+        """
+        for info in self.get_standbys():
+            yield info
+        for fs in self.map['filesystems']:
+            for info in fs['mdsmap']['info'].values():
+                yield info
+
+    def get_standbys(self):
+        """
+        Iterator for all standbys.
+        """
+        for info in self.map['standbys']:
+            yield info
+
+    def get_fsmap(self, fscid):
+        """
+        Get the fsmap for the given FSCID.
+        """
+        for fs in self.map['filesystems']:
+            if fscid is None or fs['id'] == fscid:
+                return fs
+        raise RuntimeError("FSCID {0} not in map".format(fscid))
+
+    def get_fsmap_byname(self, name):
+        """
+        Get the fsmap for the given file system name.
+        """
+        for fs in self.map['filesystems']:
+            if name is None or fs['mdsmap']['fs_name'] == name:
+                return fs
+        raise RuntimeError("FS {0} not in map".format(name))
+
+    def get_replays(self, fscid):
+        """
+        Get the standby:replay MDS for the given FSCID.
+        """
+        fs = self.get_fsmap(fscid)
+        for info in fs['mdsmap']['info'].values():
+            if info['state'] == 'up:standby-replay':
+                yield info
+
+    def get_ranks(self, fscid):
+        """
+        Get the ranks for the given FSCID.
+        """
+        fs = self.get_fsmap(fscid)
+        for info in fs['mdsmap']['info'].values():
+            if info['rank'] >= 0 and info['state'] != 'up:standby-replay':
+                yield info
+
+    def get_rank(self, fscid, rank):
+        """
+        Get the rank for the given FSCID.
+        """
+        for info in self.get_ranks(fscid):
+            if info['rank'] == rank:
+                return info
+        raise RuntimeError("FSCID {0} has no rank {1}".format(fscid, rank))
+
+    def get_mds(self, name):
+        """
+        Get the info for the given MDS name.
+        """
+        for info in self.get_all():
+            if info['name'] == name:
+                return info
+        return None
+
+    def get_mds_addr(self, name):
+        """
+        Return the instance addr as a string, like "10.214.133.138:6807\/10825"
+        """
+        info = self.get_mds(name)
+        if info:
+            return info['addr']
+        else:
+            log.warning(json.dumps(list(self.get_all()), indent=2))  # dump for debugging
+            raise RuntimeError("MDS id '{0}' not found in map".format(name))
+
+class CephCluster(object):
+    @property
+    def admin_remote(self):
+        first_mon = misc.get_first_mon(self._ctx, None)
+        (result,) = self._ctx.cluster.only(first_mon).remotes.keys()
+        return result
+
+    def __init__(self, ctx):
+        self._ctx = ctx
+        self.mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=ctx, logger=log.getChild('ceph_manager'))
+
+    def get_config(self, key, service_type=None):
+        """
+        Get config from mon by default, or a specific service if caller asks for it
+        """
+        if service_type is None:
+            service_type = 'mon'
+
+        service_id = sorted(misc.all_roles_of_type(self._ctx.cluster, service_type))[0]
+        return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+    def set_ceph_conf(self, subsys, key, value):
+        if subsys not in self._ctx.ceph['ceph'].conf:
+            self._ctx.ceph['ceph'].conf[subsys] = {}
+        self._ctx.ceph['ceph'].conf[subsys][key] = value
+        write_conf(self._ctx)  # XXX because we don't have the ceph task's config object, if they
+                               # used a different config path this won't work.
+
+    def clear_ceph_conf(self, subsys, key):
+        del self._ctx.ceph['ceph'].conf[subsys][key]
+        write_conf(self._ctx)
+
+    def json_asok(self, command, service_type, service_id, timeout=None):
+        if timeout is None:
+            timeout = 15*60
+        proc = self.mon_manager.admin_socket(service_type, service_id, command, timeout=timeout)
+        response_data = proc.stdout.getvalue()
+        log.info("_json_asok output: {0}".format(response_data))
+        if response_data.strip():
+            return json.loads(response_data)
+        else:
+            return None
+
+
+class MDSCluster(CephCluster):
+    """
+    Collective operations on all the MDS daemons in the Ceph cluster.  These
+    daemons may be in use by various Filesystems.
+
+    For the benefit of pre-multi-filesystem tests, this class is also
+    a parent of Filesystem.  The correct way to use MDSCluster going forward is
+    as a separate instance outside of your (multiple) Filesystem instances.
+    """
+    def __init__(self, ctx):
+        super(MDSCluster, self).__init__(ctx)
+
+        self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+
+        if len(self.mds_ids) == 0:
+            raise RuntimeError("This task requires at least one MDS")
+
+        if hasattr(self._ctx, "daemons"):
+            # Presence of 'daemons' attribute implies ceph task rather than ceph_deploy task
+            self.mds_daemons = dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids])
+
+    def _one_or_all(self, mds_id, cb, in_parallel=True):
+        """
+        Call a callback for a single named MDS, or for all.
+
+        Note that the parallelism here isn't for performance, it's to avoid being overly kind
+        to the cluster by waiting a graceful ssh-latency of time between doing things, and to
+        avoid being overly kind by executing them in a particular order.  However, some actions
+        don't cope with being done in parallel, so it's optional (`in_parallel`)
+
+        :param mds_id: MDS daemon name, or None
+        :param cb: Callback taking single argument of MDS daemon name
+        :param in_parallel: whether to invoke callbacks concurrently (else one after the other)
+        """
+        if mds_id is None:
+            if in_parallel:
+                with parallel() as p:
+                    for mds_id in self.mds_ids:
+                        p.spawn(cb, mds_id)
+            else:
+                for mds_id in self.mds_ids:
+                    cb(mds_id)
+        else:
+            cb(mds_id)
+
+    def get_config(self, key, service_type=None):
+        """
+        get_config specialization of service_type="mds"
+        """
+        if service_type != "mds":
+            return super(MDSCluster, self).get_config(key, service_type)
+
+        # Some tests stop MDS daemons, don't send commands to a dead one:
+        running_daemons = [i for i, mds in self.mds_daemons.items() if mds.running()]
+        service_id = random.sample(running_daemons, 1)[0]
+        return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+    def mds_stop(self, mds_id=None):
+        """
+        Stop the MDS daemon process(se).  If it held a rank, that rank
+        will eventually go laggy.
+        """
+        self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop())
+
+    def mds_fail(self, mds_id=None):
+        """
+        Inform MDSMonitor of the death of the daemon process(es).  If it held
+        a rank, that rank will be relinquished.
+        """
+        self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_))
+
+    def mds_restart(self, mds_id=None):
+        self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart())
+
+    def mds_fail_restart(self, mds_id=None):
+        """
+        Variation on restart that includes marking MDSs as failed, so that doing this
+        operation followed by waiting for healthy daemon states guarantees that they
+        have gone down and come up, rather than potentially seeing the healthy states
+        that existed before the restart.
+        """
+        def _fail_restart(id_):
+            self.mds_daemons[id_].stop()
+            self.mon_manager.raw_cluster_cmd("mds", "fail", id_)
+            self.mds_daemons[id_].restart()
+
+        self._one_or_all(mds_id, _fail_restart)
+
+    def mds_signal(self, mds_id, sig, silent=False):
+        """
+        signal a MDS daemon
+        """
+        self.mds_daemons[mds_id].signal(sig, silent);
+
+    def newfs(self, name='cephfs', create=True):
+        return Filesystem(self._ctx, name=name, create=create)
+
+    def status(self):
+        return FSStatus(self.mon_manager)
+
+    def delete_all_filesystems(self):
+        """
+        Remove all filesystems that exist, and any pools in use by them.
+        """
+        pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+        pool_id_name = {}
+        for pool in pools:
+            pool_id_name[pool['pool']] = pool['pool_name']
+
+        # mark cluster down for each fs to prevent churn during deletion
+        status = self.status()
+        for fs in status.get_filesystems():
+            self.mon_manager.raw_cluster_cmd("fs", "fail", str(fs['mdsmap']['fs_name']))
+
+        # get a new copy as actives may have since changed
+        status = self.status()
+        for fs in status.get_filesystems():
+            mdsmap = fs['mdsmap']
+            metadata_pool = pool_id_name[mdsmap['metadata_pool']]
+
+            self.mon_manager.raw_cluster_cmd('fs', 'rm', mdsmap['fs_name'], '--yes-i-really-mean-it')
+            self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                             metadata_pool, metadata_pool,
+                                             '--yes-i-really-really-mean-it')
+            for data_pool in mdsmap['data_pools']:
+                data_pool = pool_id_name[data_pool]
+                try:
+                    self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                                     data_pool, data_pool,
+                                                     '--yes-i-really-really-mean-it')
+                except CommandFailedError as e:
+                    if e.exitstatus == 16: # EBUSY, this data pool is used
+                        pass               # by two metadata pools, let the 2nd
+                    else:                  # pass delete it
+                        raise
+
+    def get_standby_daemons(self):
+        return set([s['name'] for s in self.status().get_standbys()])
+
+    def get_mds_hostnames(self):
+        result = set()
+        for mds_id in self.mds_ids:
+            mds_remote = self.mon_manager.find_remote('mds', mds_id)
+            result.add(mds_remote.hostname)
+
+        return list(result)
+
+    def set_clients_block(self, blocked, mds_id=None):
+        """
+        Block (using iptables) client communications to this MDS.  Be careful: if
+        other services are running on this MDS, or other MDSs try to talk to this
+        MDS, their communications may also be blocked as collatoral damage.
+
+        :param mds_id: Optional ID of MDS to block, default to all
+        :return:
+        """
+        da_flag = "-A" if blocked else "-D"
+
+        def set_block(_mds_id):
+            remote = self.mon_manager.find_remote('mds', _mds_id)
+            status = self.status()
+
+            addr = status.get_mds_addr(_mds_id)
+            ip_str, port_str, inst_str = re.match("(.+):(.+)/(.+)", addr).groups()
+
+            remote.run(
+                args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m",
+                      "comment", "--comment", "teuthology"])
+            remote.run(
+                args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m",
+                      "comment", "--comment", "teuthology"])
+
+        self._one_or_all(mds_id, set_block, in_parallel=False)
+
+    def clear_firewall(self):
+        clear_firewall(self._ctx)
+
+    def get_mds_info(self, mds_id):
+        return FSStatus(self.mon_manager).get_mds(mds_id)
+
+    def is_pool_full(self, pool_name):
+        pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+        for pool in pools:
+            if pool['pool_name'] == pool_name:
+                return 'full' in pool['flags_names'].split(",")
+
+        raise RuntimeError("Pool not found '{0}'".format(pool_name))
+
+class Filesystem(MDSCluster):
+    """
+    This object is for driving a CephFS filesystem.  The MDS daemons driven by
+    MDSCluster may be shared with other Filesystems.
+    """
+    def __init__(self, ctx, fs_config=None, fscid=None, name=None, create=False,
+                 ec_profile=None):
+        super(Filesystem, self).__init__(ctx)
+
+        self.name = name
+        self.ec_profile = ec_profile
+        self.id = None
+        self.metadata_pool_name = None
+        self.metadata_overlay = False
+        self.data_pool_name = None
+        self.data_pools = None
+        self.fs_config = fs_config
+
+        client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
+        self.client_id = client_list[0]
+        self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1]
+
+        if name is not None:
+            if fscid is not None:
+                raise RuntimeError("cannot specify fscid when creating fs")
+            if create and not self.legacy_configured():
+                self.create()
+        else:
+            if fscid is not None:
+                self.id = fscid
+                self.getinfo(refresh = True)
+
+        # Stash a reference to the first created filesystem on ctx, so
+        # that if someone drops to the interactive shell they can easily
+        # poke our methods.
+        if not hasattr(self._ctx, "filesystem"):
+            self._ctx.filesystem = self
+
+    def get_task_status(self, status_key):
+        return self.mon_manager.get_service_task_status("mds", status_key)
+
+    def getinfo(self, refresh = False):
+        status = self.status()
+        if self.id is not None:
+            fsmap = status.get_fsmap(self.id)
+        elif self.name is not None:
+            fsmap = status.get_fsmap_byname(self.name)
+        else:
+            fss = [fs for fs in status.get_filesystems()]
+            if len(fss) == 1:
+                fsmap = fss[0]
+            elif len(fss) == 0:
+                raise RuntimeError("no file system available")
+            else:
+                raise RuntimeError("more than one file system available")
+        self.id = fsmap['id']
+        self.name = fsmap['mdsmap']['fs_name']
+        self.get_pool_names(status = status, refresh = refresh)
+        return status
+
+    def set_metadata_overlay(self, overlay):
+        if self.id is not None:
+            raise RuntimeError("cannot specify fscid when configuring overlay")
+        self.metadata_overlay = overlay
+
+    def deactivate(self, rank):
+        if rank < 0:
+            raise RuntimeError("invalid rank")
+        elif rank == 0:
+            raise RuntimeError("cannot deactivate rank 0")
+        self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
+
+    def reach_max_mds(self):
+        # Try to reach rank count == max_mds, up or down (UPGRADE SENSITIVE!)
+        status = self.getinfo()
+        mds_map = self.get_mds_map(status=status)
+        max_mds = mds_map['max_mds']
+
+        count = len(list(self.get_ranks(status=status)))
+        if count > max_mds:
+            try:
+                # deactivate mds in decending order
+                status = self.wait_for_daemons(status=status, skip_max_mds_check=True)
+                while count > max_mds:
+                    targets = sorted(self.get_ranks(status=status), key=lambda r: r['rank'], reverse=True)
+                    target = targets[0]
+                    log.debug("deactivating rank %d" % target['rank'])
+                    self.deactivate(target['rank'])
+                    status = self.wait_for_daemons(skip_max_mds_check=True)
+                    count = len(list(self.get_ranks(status=status)))
+            except:
+                # In Mimic, deactivation is done automatically:
+                log.info("Error:\n{}".format(traceback.format_exc()))
+                status = self.wait_for_daemons()
+        else:
+            status = self.wait_for_daemons()
+
+        mds_map = self.get_mds_map(status=status)
+        assert(mds_map['max_mds'] == max_mds)
+        assert(mds_map['in'] == list(range(0, max_mds)))
+
+    def fail(self):
+        self.mon_manager.raw_cluster_cmd("fs", "fail", str(self.name))
+
+    def set_var(self, var, *args):
+        a = map(str, args)
+        self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
+
+    def set_down(self, down=True):
+        self.set_var("down", str(down).lower())
+
+    def set_joinable(self, joinable=True):
+        self.set_var("joinable", str(joinable).lower())
+
+    def set_max_mds(self, max_mds):
+        self.set_var("max_mds", "%d" % max_mds)
+
+    def set_session_timeout(self, timeout):
+        self.set_var("session_timeout", "%d" % timeout)
+
+    def set_allow_standby_replay(self, yes):
+        self.set_var("allow_standby_replay", str(yes).lower())
+
+    def set_allow_new_snaps(self, yes):
+        self.set_var("allow_new_snaps", str(yes).lower(), '--yes-i-really-mean-it')
+
+    # In Octopus+, the PG count can be omitted to use the default. We keep the
+    # hard-coded value for deployments of Mimic/Nautilus.
+    pgs_per_fs_pool = 8
+
+    def create(self):
+        if self.name is None:
+            self.name = "cephfs"
+        if self.metadata_pool_name is None:
+            self.metadata_pool_name = "{0}_metadata".format(self.name)
+        if self.data_pool_name is None:
+            data_pool_name = "{0}_data".format(self.name)
+        else:
+            data_pool_name = self.data_pool_name
+
+        log.debug("Creating filesystem '{0}'".format(self.name))
+
+        self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                         self.metadata_pool_name, self.pgs_per_fs_pool.__str__())
+        if self.metadata_overlay:
+            self.mon_manager.raw_cluster_cmd('fs', 'new',
+                                             self.name, self.metadata_pool_name, data_pool_name,
+                                             '--allow-dangerous-metadata-overlay')
+        else:
+            if self.ec_profile and 'disabled' not in self.ec_profile:
+                log.debug("EC profile is %s", self.ec_profile)
+                cmd = ['osd', 'erasure-code-profile', 'set', data_pool_name]
+                cmd.extend(self.ec_profile)
+                self.mon_manager.raw_cluster_cmd(*cmd)
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'create',
+                    data_pool_name, self.pgs_per_fs_pool.__str__(), 'erasure',
+                    data_pool_name)
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'set',
+                    data_pool_name, 'allow_ec_overwrites', 'true')
+            else:
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'create',
+                    data_pool_name, self.pgs_per_fs_pool.__str__())
+            self.mon_manager.raw_cluster_cmd('fs', 'new',
+                                             self.name,
+                                             self.metadata_pool_name,
+                                             data_pool_name,
+                                             "--force")
+        self.check_pool_application(self.metadata_pool_name)
+        self.check_pool_application(data_pool_name)
+        # Turn off spurious standby count warnings from modifying max_mds in tests.
+        try:
+            self.mon_manager.raw_cluster_cmd('fs', 'set', self.name, 'standby_count_wanted', '0')
+        except CommandFailedError as e:
+            if e.exitstatus == 22:
+                # standby_count_wanted not available prior to luminous (upgrade tests would fail otherwise)
+                pass
+            else:
+                raise
+
+        if self.fs_config is not None:
+            max_mds = self.fs_config.get('max_mds', 1)
+            if max_mds > 1:
+                self.set_max_mds(max_mds)
+
+            # If absent will use the default value (60 seconds)
+            session_timeout = self.fs_config.get('session_timeout', 60)
+            if session_timeout != 60:
+                self.set_session_timeout(session_timeout)
+
+        self.getinfo(refresh = True)
+
+        
+    def check_pool_application(self, pool_name):
+        osd_map = self.mon_manager.get_osd_dump_json()
+        for pool in osd_map['pools']:
+            if pool['pool_name'] == pool_name:
+                if "application_metadata" in pool:
+                    if not "cephfs" in pool['application_metadata']:
+                        raise RuntimeError("Pool %p does not name cephfs as application!".\
+                                           format(pool_name))
+        
+
+    def __del__(self):
+        if getattr(self._ctx, "filesystem", None) == self:
+            delattr(self._ctx, "filesystem")
+
+    def exists(self):
+        """
+        Whether a filesystem exists in the mon's filesystem list
+        """
+        fs_list = json.loads(self.mon_manager.raw_cluster_cmd('fs', 'ls', '--format=json-pretty'))
+        return self.name in [fs['name'] for fs in fs_list]
+
+    def legacy_configured(self):
+        """
+        Check if a legacy (i.e. pre "fs new") filesystem configuration is present.  If this is
+        the case, the caller should avoid using Filesystem.create
+        """
+        try:
+            out_text = self.mon_manager.raw_cluster_cmd('--format=json-pretty', 'osd', 'lspools')
+            pools = json.loads(out_text)
+            metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools]
+            if metadata_pool_exists:
+                self.metadata_pool_name = 'metadata'
+        except CommandFailedError as e:
+            # For use in upgrade tests, Ceph cuttlefish and earlier don't support
+            # structured output (--format) from the CLI.
+            if e.exitstatus == 22:
+                metadata_pool_exists = True
+            else:
+                raise
+
+        return metadata_pool_exists
+
+    def _df(self):
+        return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty"))
+
+    def get_mds_map(self, status=None):
+        if status is None:
+            status = self.status()
+        return status.get_fsmap(self.id)['mdsmap']
+
+    def get_var(self, var, status=None):
+        return self.get_mds_map(status=status)[var]
+
+    def set_dir_layout(self, mount, path, layout):
+        for name, value in layout.items():
+            mount.run_shell(args=["setfattr", "-n", "ceph.dir.layout."+name, "-v", str(value), path])
+
+    def add_data_pool(self, name, create=True):
+        if create:
+            self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, self.pgs_per_fs_pool.__str__())
+        self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name)
+        self.get_pool_names(refresh = True)
+        for poolid, fs_name in self.data_pools.items():
+            if name == fs_name:
+                return poolid
+        raise RuntimeError("could not get just created pool '{0}'".format(name))
+
+    def get_pool_names(self, refresh = False, status = None):
+        if refresh or self.metadata_pool_name is None or self.data_pools is None:
+            if status is None:
+                status = self.status()
+            fsmap = status.get_fsmap(self.id)
+
+            osd_map = self.mon_manager.get_osd_dump_json()
+            id_to_name = {}
+            for p in osd_map['pools']:
+                id_to_name[p['pool']] = p['pool_name']
+
+            self.metadata_pool_name = id_to_name[fsmap['mdsmap']['metadata_pool']]
+            self.data_pools = {}
+            for data_pool in fsmap['mdsmap']['data_pools']:
+                self.data_pools[data_pool] = id_to_name[data_pool]
+
+    def get_data_pool_name(self, refresh = False):
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        assert(len(self.data_pools) == 1)
+        return next(iter(self.data_pools.values()))
+
+    def get_data_pool_id(self, refresh = False):
+        """
+        Don't call this if you have multiple data pools
+        :return: integer
+        """
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        assert(len(self.data_pools) == 1)
+        return next(iter(self.data_pools.keys()))
+
+    def get_data_pool_names(self, refresh = False):
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        return list(self.data_pools.values())
+
+    def get_metadata_pool_name(self):
+        return self.metadata_pool_name
+
+    def set_data_pool_name(self, name):
+        if self.id is not None:
+            raise RuntimeError("can't set filesystem name if its fscid is set")
+        self.data_pool_name = name
+
+    def get_namespace_id(self):
+        return self.id
+
+    def get_pool_df(self, pool_name):
+        """
+        Return a dict like:
+        {u'bytes_used': 0, u'max_avail': 83848701, u'objects': 0, u'kb_used': 0}
+        """
+        for pool_df in self._df()['pools']:
+            if pool_df['name'] == pool_name:
+                return pool_df['stats']
+
+        raise RuntimeError("Pool name '{0}' not found".format(pool_name))
+
+    def get_usage(self):
+        return self._df()['stats']['total_used_bytes']
+
+    def are_daemons_healthy(self, status=None, skip_max_mds_check=False):
+        """
+        Return true if all daemons are in one of active, standby, standby-replay, and
+        at least max_mds daemons are in 'active'.
+
+        Unlike most of Filesystem, this function is tolerant of new-style `fs`
+        commands being missing, because we are part of the ceph installation
+        process during upgrade suites, so must fall back to old style commands
+        when we get an EINVAL on a new style command.
+
+        :return:
+        """
+        # First, check to see that processes haven't exited with an error code
+        for mds in self._ctx.daemons.iter_daemons_of_role('mds'):
+            mds.check_status()
+
+        active_count = 0
+        try:
+            mds_map = self.get_mds_map(status=status)
+        except CommandFailedError as cfe:
+            # Old version, fall back to non-multi-fs commands
+            if cfe.exitstatus == errno.EINVAL:
+                mds_map = json.loads(
+                        self.mon_manager.raw_cluster_cmd('mds', 'dump', '--format=json'))
+            else:
+                raise
+
+        log.debug("are_daemons_healthy: mds map: {0}".format(mds_map))
+
+        for mds_id, mds_status in mds_map['info'].items():
+            if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]:
+                log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state']))
+                return False
+            elif mds_status['state'] == 'up:active':
+                active_count += 1
+
+        log.debug("are_daemons_healthy: {0}/{1}".format(
+            active_count, mds_map['max_mds']
+        ))
+
+        if not skip_max_mds_check:
+            if active_count > mds_map['max_mds']:
+                log.debug("are_daemons_healthy: number of actives is greater than max_mds: {0}".format(mds_map))
+                return False
+            elif active_count == mds_map['max_mds']:
+                # The MDSMap says these guys are active, but let's check they really are
+                for mds_id, mds_status in mds_map['info'].items():
+                    if mds_status['state'] == 'up:active':
+                        try:
+                            daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
+                        except CommandFailedError as cfe:
+                            if cfe.exitstatus == errno.EINVAL:
+                                # Old version, can't do this check
+                                continue
+                            else:
+                                # MDS not even running
+                                return False
+
+                        if daemon_status['state'] != 'up:active':
+                            # MDS hasn't taken the latest map yet
+                            return False
+
+                return True
+            else:
+                return False
+        else:
+            log.debug("are_daemons_healthy: skipping max_mds check")
+            return True
+
+    def get_daemon_names(self, state=None, status=None):
+        """
+        Return MDS daemon names of those daemons in the given state
+        :param state:
+        :return:
+        """
+        mdsmap = self.get_mds_map(status)
+        result = []
+        for mds_status in sorted(mdsmap['info'].values(),
+                                 key=lambda _: _['rank']):
+            if mds_status['state'] == state or state is None:
+                result.append(mds_status['name'])
+
+        return result
+
+    def get_active_names(self):
+        """
+        Return MDS daemon names of those daemons holding ranks
+        in state up:active
+
+        :return: list of strings like ['a', 'b'], sorted by rank
+        """
+        return self.get_daemon_names("up:active")
+
+    def get_all_mds_rank(self, status=None):
+        mdsmap = self.get_mds_map(status)
+        result = []
+        for mds_status in sorted(mdsmap['info'].values(),
+                                 key=lambda _: _['rank']):
+            if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
+                result.append(mds_status['rank'])
+
+        return result
+
+    def get_rank(self, rank=0, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_rank(self.id, rank)
+
+    def rank_restart(self, rank=0, status=None):
+        name = self.get_rank(rank=rank, status=status)['name']
+        self.mds_restart(mds_id=name)
+
+    def rank_signal(self, signal, rank=0, status=None):
+        name = self.get_rank(rank=rank, status=status)['name']
+        self.mds_signal(name, signal)
+
+    def rank_freeze(self, yes, rank=0):
+        self.mon_manager.raw_cluster_cmd("mds", "freeze", "{}:{}".format(self.id, rank), str(yes).lower())
+
+    def rank_fail(self, rank=0):
+        self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank))
+
+    def get_ranks(self, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_ranks(self.id)
+
+    def get_replays(self, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_replays(self.id)
+
+    def get_replay(self, rank=0, status=None):
+        for replay in self.get_replays(status=status):
+            if replay['rank'] == rank:
+                return replay
+        return None
+
+    def get_rank_names(self, status=None):
+        """
+        Return MDS daemon names of those daemons holding a rank,
+        sorted by rank.  This includes e.g. up:replay/reconnect
+        as well as active, but does not include standby or
+        standby-replay.
+        """
+        mdsmap = self.get_mds_map(status)
+        result = []
+        for mds_status in sorted(mdsmap['info'].values(),
+                                 key=lambda _: _['rank']):
+            if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
+                result.append(mds_status['name'])
+
+        return result
+
+    def wait_for_daemons(self, timeout=None, skip_max_mds_check=False, status=None):
+        """
+        Wait until all daemons are healthy
+        :return:
+        """
+
+        if timeout is None:
+            timeout = DAEMON_WAIT_TIMEOUT
+
+        if status is None:
+            status = self.status()
+
+        elapsed = 0
+        while True:
+            if self.are_daemons_healthy(status=status, skip_max_mds_check=skip_max_mds_check):
+                return status
+            else:
+                time.sleep(1)
+                elapsed += 1
+
+            if elapsed > timeout:
+                log.debug("status = {0}".format(status))
+                raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
+
+            status = self.status()
+
+    def get_lone_mds_id(self):
+        """
+        Get a single MDS ID: the only one if there is only one
+        configured, else the only one currently holding a rank,
+        else raise an error.
+        """
+        if len(self.mds_ids) != 1:
+            alive = self.get_rank_names()
+            if len(alive) == 1:
+                return alive[0]
+            else:
+                raise ValueError("Explicit MDS argument required when multiple MDSs in use")
+        else:
+            return self.mds_ids[0]
+
+    def recreate(self):
+        log.info("Creating new filesystem")
+        self.delete_all_filesystems()
+        self.id = None
+        self.create()
+
+    def put_metadata_object_raw(self, object_id, infile):
+        """
+        Save an object to the metadata pool
+        """
+        temp_bin_path = infile
+        self.client_remote.run(args=[
+            'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'put', object_id, temp_bin_path
+        ])
+
+    def get_metadata_object_raw(self, object_id):
+        """
+        Retrieve an object from the metadata pool and store it in a file.
+        """
+        temp_bin_path = '/tmp/' + object_id + '.bin'
+
+        self.client_remote.run(args=[
+            'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path
+        ])
+
+        return temp_bin_path
+
+    def get_metadata_object(self, object_type, object_id):
+        """
+        Retrieve an object from the metadata pool, pass it through
+        ceph-dencoder to dump it to JSON, and return the decoded object.
+        """
+        temp_bin_path = '/tmp/out.bin'
+
+        self.client_remote.run(args=[
+            'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path
+        ])
+
+        dump_json = self.client_remote.sh([
+            'sudo', os.path.join(self._prefix, 'ceph-dencoder'), 'type', object_type, 'import', temp_bin_path, 'decode', 'dump_json'
+        ]).strip()
+        try:
+            dump = json.loads(dump_json)
+        except (TypeError, ValueError):
+            log.error("Failed to decode JSON: '{0}'".format(dump_json))
+            raise
+
+        return dump
+
+    def get_journal_version(self):
+        """
+        Read the JournalPointer and Journal::Header objects to learn the version of
+        encoding in use.
+        """
+        journal_pointer_object = '400.00000000'
+        journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object)
+        journal_ino = journal_pointer_dump['journal_pointer']['front']
+
+        journal_header_object = "{0:x}.00000000".format(journal_ino)
+        journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object)
+
+        version = journal_header_dump['journal_header']['stream_format']
+        log.debug("Read journal version {0}".format(version))
+
+        return version
+
+    def mds_asok(self, command, mds_id=None, timeout=None):
+        if mds_id is None:
+            mds_id = self.get_lone_mds_id()
+
+        return self.json_asok(command, 'mds', mds_id, timeout=timeout)
+
+    def rank_asok(self, command, rank=0, status=None, timeout=None):
+        info = self.get_rank(rank=rank, status=status)
+        return self.json_asok(command, 'mds', info['name'], timeout=timeout)
+
+    def rank_tell(self, command, rank=0, status=None):
+        info = self.get_rank(rank=rank, status=status)
+        return json.loads(self.mon_manager.raw_cluster_cmd("tell", 'mds.{0}'.format(info['name']), *command))
+
+    def read_cache(self, path, depth=None):
+        cmd = ["dump", "tree", path]
+        if depth is not None:
+            cmd.append(depth.__str__())
+        result = self.mds_asok(cmd)
+        if len(result) == 0:
+            raise RuntimeError("Path not found in cache: {0}".format(path))
+
+        return result
+
+    def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None, rank=None):
+        """
+        Block until the MDS reaches a particular state, or a failure condition
+        is met.
+
+        When there are multiple MDSs, succeed when exaclty one MDS is in the
+        goal state, or fail when any MDS is in the reject state.
+
+        :param goal_state: Return once the MDS is in this state
+        :param reject: Fail if the MDS enters this state before the goal state
+        :param timeout: Fail if this many seconds pass before reaching goal
+        :return: number of seconds waited, rounded down to integer
+        """
+
+        started_at = time.time()
+        while True:
+            status = self.status()
+            if rank is not None:
+                try:
+                    mds_info = status.get_rank(self.id, rank)
+                    current_state = mds_info['state'] if mds_info else None
+                    log.debug("Looked up MDS state for mds.{0}: {1}".format(rank, current_state))
+                except:
+                    mdsmap = self.get_mds_map(status=status)
+                    if rank in mdsmap['failed']:
+                        log.debug("Waiting for rank {0} to come back.".format(rank))
+                        current_state = None
+                    else:
+                        raise
+            elif mds_id is not None:
+                # mds_info is None if no daemon with this ID exists in the map
+                mds_info = status.get_mds(mds_id)
+                current_state = mds_info['state'] if mds_info else None
+                log.debug("Looked up MDS state for {0}: {1}".format(mds_id, current_state))
+            else:
+                # In general, look for a single MDS
+                states = [m['state'] for m in status.get_ranks(self.id)]
+                if [s for s in states if s == goal_state] == [goal_state]:
+                    current_state = goal_state
+                elif reject in states:
+                    current_state = reject
+                else:
+                    current_state = None
+                log.debug("mapped states {0} to {1}".format(states, current_state))
+
+            elapsed = time.time() - started_at
+            if current_state == goal_state:
+                log.debug("reached state '{0}' in {1}s".format(current_state, elapsed))
+                return elapsed
+            elif reject is not None and current_state == reject:
+                raise RuntimeError("MDS in reject state {0}".format(current_state))
+            elif timeout is not None and elapsed > timeout:
+                log.error("MDS status at timeout: {0}".format(status.get_fsmap(self.id)))
+                raise RuntimeError(
+                    "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format(
+                        elapsed, goal_state, current_state
+                    ))
+            else:
+                time.sleep(1)
+
+    def _read_data_xattr(self, ino_no, xattr_name, type, pool):
+        mds_id = self.mds_ids[0]
+        remote = self.mds_daemons[mds_id].remote
+        if pool is None:
+            pool = self.get_data_pool_name()
+
+        obj_name = "{0:x}.00000000".format(ino_no)
+
+        args = [
+            os.path.join(self._prefix, "rados"), "-p", pool, "getxattr", obj_name, xattr_name
+        ]
+        try:
+            proc = remote.run(args=args, stdout=BytesIO())
+        except CommandFailedError as e:
+            log.error(e.__str__())
+            raise ObjectNotFound(obj_name)
+
+        data = proc.stdout.getvalue()
+        dump = remote.sh(
+            [os.path.join(self._prefix, "ceph-dencoder"),
+                                            "type", type,
+                                            "import", "-",
+                                            "decode", "dump_json"],
+            stdin=data,
+            stdout=StringIO()
+        )
+
+        return json.loads(dump.strip())
+
+    def _write_data_xattr(self, ino_no, xattr_name, data, pool=None):
+        """
+        Write to an xattr of the 0th data object of an inode.  Will
+        succeed whether the object and/or xattr already exist or not.
+
+        :param ino_no: integer inode number
+        :param xattr_name: string name of the xattr
+        :param data: byte array data to write to the xattr
+        :param pool: name of data pool or None to use primary data pool
+        :return: None
+        """
+        remote = self.mds_daemons[self.mds_ids[0]].remote
+        if pool is None:
+            pool = self.get_data_pool_name()
+
+        obj_name = "{0:x}.00000000".format(ino_no)
+        args = [
+            os.path.join(self._prefix, "rados"), "-p", pool, "setxattr",
+            obj_name, xattr_name, data
+        ]
+        remote.sh(args)
+
+    def read_backtrace(self, ino_no, pool=None):
+        """
+        Read the backtrace from the data pool, return a dict in the format
+        given by inode_backtrace_t::dump, which is something like:
+
+        ::
+
+            rados -p cephfs_data getxattr 10000000002.00000000 parent > out.bin
+            ceph-dencoder type inode_backtrace_t import out.bin decode dump_json
+
+            { "ino": 1099511627778,
+              "ancestors": [
+                    { "dirino": 1,
+                      "dname": "blah",
+                      "version": 11}],
+              "pool": 1,
+              "old_pools": []}
+
+        :param pool: name of pool to read backtrace from.  If omitted, FS must have only
+                     one data pool and that will be used.
+        """
+        return self._read_data_xattr(ino_no, "parent", "inode_backtrace_t", pool)
+
+    def read_layout(self, ino_no, pool=None):
+        """
+        Read 'layout' xattr of an inode and parse the result, returning a dict like:
+        ::
+            {
+                "stripe_unit": 4194304,
+                "stripe_count": 1,
+                "object_size": 4194304,
+                "pool_id": 1,
+                "pool_ns": "",
+            }
+
+        :param pool: name of pool to read backtrace from.  If omitted, FS must have only
+                     one data pool and that will be used.
+        """
+        return self._read_data_xattr(ino_no, "layout", "file_layout_t", pool)
+
+    def _enumerate_data_objects(self, ino, size):
+        """
+        Get the list of expected data objects for a range, and the list of objects
+        that really exist.
+
+        :return a tuple of two lists of strings (expected, actual)
+        """
+        stripe_size = 1024 * 1024 * 4
+
+        size = max(stripe_size, size)
+
+        want_objects = [
+            "{0:x}.{1:08x}".format(ino, n)
+            for n in range(0, ((size - 1) // stripe_size) + 1)
+        ]
+
+        exist_objects = self.rados(["ls"], pool=self.get_data_pool_name()).split("\n")
+
+        return want_objects, exist_objects
+
+    def data_objects_present(self, ino, size):
+        """
+        Check that *all* the expected data objects for an inode are present in the data pool
+        """
+
+        want_objects, exist_objects = self._enumerate_data_objects(ino, size)
+        missing = set(want_objects) - set(exist_objects)
+
+        if missing:
+            log.debug("Objects missing (ino {0}, size {1}): {2}".format(
+                ino, size, missing
+            ))
+            return False
+        else:
+            log.debug("All objects for ino {0} size {1} found".format(ino, size))
+            return True
+
+    def data_objects_absent(self, ino, size):
+        want_objects, exist_objects = self._enumerate_data_objects(ino, size)
+        present = set(want_objects) & set(exist_objects)
+
+        if present:
+            log.debug("Objects not absent (ino {0}, size {1}): {2}".format(
+                ino, size, present
+            ))
+            return False
+        else:
+            log.debug("All objects for ino {0} size {1} are absent".format(ino, size))
+            return True
+
+    def dirfrag_exists(self, ino, frag):
+        try:
+            self.rados(["stat", "{0:x}.{1:08x}".format(ino, frag)])
+        except CommandFailedError:
+            return False
+        else:
+            return True
+
+    def rados(self, args, pool=None, namespace=None, stdin_data=None,
+              stdin_file=None,
+              stdout_data=None):
+        """
+        Call into the `rados` CLI from an MDS
+        """
+
+        if pool is None:
+            pool = self.get_metadata_pool_name()
+
+        # Doesn't matter which MDS we use to run rados commands, they all
+        # have access to the pools
+        mds_id = self.mds_ids[0]
+        remote = self.mds_daemons[mds_id].remote
+
+        # NB we could alternatively use librados pybindings for this, but it's a one-liner
+        # using the `rados` CLI
+        args = ([os.path.join(self._prefix, "rados"), "-p", pool] +
+                (["--namespace", namespace] if namespace else []) +
+                args)
+
+        if stdin_file is not None:
+            args = ["bash", "-c", "cat " + stdin_file + " | " + " ".join(args)]
+        if stdout_data is None:
+            stdout_data = StringIO()
+
+        p = remote.run(args=args,
+                       stdin=stdin_data,
+                       stdout=stdout_data)
+        return p.stdout.getvalue().strip()
+
+    def list_dirfrag(self, dir_ino):
+        """
+        Read the named object and return the list of omap keys
+
+        :return a list of 0 or more strings
+        """
+
+        dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
+
+        try:
+            key_list_str = self.rados(["listomapkeys", dirfrag_obj_name])
+        except CommandFailedError as e:
+            log.error(e.__str__())
+            raise ObjectNotFound(dirfrag_obj_name)
+
+        return key_list_str.split("\n") if key_list_str else []
+
+    def erase_metadata_objects(self, prefix):
+        """
+        For all objects in the metadata pool matching the prefix,
+        erase them.
+
+        This O(N) with the number of objects in the pool, so only suitable
+        for use on toy test filesystems.
+        """
+        all_objects = self.rados(["ls"]).split("\n")
+        matching_objects = [o for o in all_objects if o.startswith(prefix)]
+        for o in matching_objects:
+            self.rados(["rm", o])
+
+    def erase_mds_objects(self, rank):
+        """
+        Erase all the per-MDS objects for a particular rank.  This includes
+        inotable, sessiontable, journal
+        """
+
+        def obj_prefix(multiplier):
+            """
+            MDS object naming conventions like rank 1's
+            journal is at 201.***
+            """
+            return "%x." % (multiplier * 0x100 + rank)
+
+        # MDS_INO_LOG_OFFSET
+        self.erase_metadata_objects(obj_prefix(2))
+        # MDS_INO_LOG_BACKUP_OFFSET
+        self.erase_metadata_objects(obj_prefix(3))
+        # MDS_INO_LOG_POINTER_OFFSET
+        self.erase_metadata_objects(obj_prefix(4))
+        # MDSTables & SessionMap
+        self.erase_metadata_objects("mds{rank:d}_".format(rank=rank))
+
+    @property
+    def _prefix(self):
+        """
+        Override this to set a different
+        """
+        return ""
+
+    def _make_rank(self, rank):
+        return "{}:{}".format(self.name, rank)
+
+    def _run_tool(self, tool, args, rank=None, quiet=False):
+        # Tests frequently have [client] configuration that jacks up
+        # the objecter log level (unlikely to be interesting here)
+        # and does not set the mds log level (very interesting here)
+        if quiet:
+            base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1']
+        else:
+            base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
+
+        if rank is not None:
+            base_args.extend(["--rank", "%s" % str(rank)])
+
+        t1 = datetime.datetime.now()
+        r = self.tool_remote.sh(script=base_args + args, stdout=StringIO()).strip()
+        duration = datetime.datetime.now() - t1
+        log.debug("Ran {0} in time {1}, result:\n{2}".format(
+            base_args + args, duration, r
+        ))
+        return r
+
+    @property
+    def tool_remote(self):
+        """
+        An arbitrary remote to use when invoking recovery tools.  Use an MDS host because
+        it'll definitely have keys with perms to access cephfs metadata pool.  This is public
+        so that tests can use this remote to go get locally written output files from the tools.
+        """
+        mds_id = self.mds_ids[0]
+        return self.mds_daemons[mds_id].remote
+
+    def journal_tool(self, args, rank, quiet=False):
+        """
+        Invoke cephfs-journal-tool with the passed arguments for a rank, and return its stdout
+        """
+        fs_rank = self._make_rank(rank)
+        return self._run_tool("cephfs-journal-tool", args, fs_rank, quiet)
+
+    def table_tool(self, args, quiet=False):
+        """
+        Invoke cephfs-table-tool with the passed arguments, and return its stdout
+        """
+        return self._run_tool("cephfs-table-tool", args, None, quiet)
+
+    def data_scan(self, args, quiet=False, worker_count=1):
+        """
+        Invoke cephfs-data-scan with the passed arguments, and return its stdout
+
+        :param worker_count: if greater than 1, multiple workers will be run
+                             in parallel and the return value will be None
+        """
+
+        workers = []
+
+        for n in range(0, worker_count):
+            if worker_count > 1:
+                # data-scan args first token is a command, followed by args to it.
+                # insert worker arguments after the command.
+                cmd = args[0]
+                worker_args = [cmd] + ["--worker_n", n.__str__(), "--worker_m", worker_count.__str__()] + args[1:]
+            else:
+                worker_args = args
+
+            workers.append(Greenlet.spawn(lambda wargs=worker_args:
+                                          self._run_tool("cephfs-data-scan", wargs, None, quiet)))
+
+        for w in workers:
+            w.get()
+
+        if worker_count == 1:
+            return workers[0].value
+        else:
+            return None
+
+    def is_full(self):
+        return self.is_pool_full(self.get_data_pool_name())
diff --git a/qa/tasks/cephfs/fuse_mount.py b/qa/tasks/cephfs/fuse_mount.py
new file mode 100644
index 00000000..664de4f4
--- /dev/null
+++ b/qa/tasks/cephfs/fuse_mount.py
@@ -0,0 +1,502 @@
+from io import StringIO
+import json
+import time
+import logging
+
+import six
+
+from textwrap import dedent
+
+from teuthology import misc
+from teuthology.contextutil import MaxWhileTries
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+
+class FuseMount(CephFSMount):
+    def __init__(self, ctx, client_config, test_dir, client_id, client_remote):
+        super(FuseMount, self).__init__(ctx, test_dir, client_id, client_remote)
+
+        self.client_config = client_config if client_config else {}
+        self.fuse_daemon = None
+        self._fuse_conn = None
+        self.id = None
+        self.inst = None
+        self.addr = None
+
+    def mount(self, mount_path=None, mount_fs_name=None, mountpoint=None):
+        if mountpoint is not None:
+            self.mountpoint = mountpoint
+        self.setupfs(name=mount_fs_name)
+
+        try:
+            return self._mount(mount_path, mount_fs_name)
+        except RuntimeError:
+            # Catch exceptions by the mount() logic (i.e. not remote command
+            # failures) and ensure the mount is not left half-up.
+            # Otherwise we might leave a zombie mount point that causes
+            # anyone traversing cephtest/ to get hung up on.
+            log.warning("Trying to clean up after failed mount")
+            self.umount_wait(force=True)
+            raise
+
+    def _mount(self, mount_path, mount_fs_name):
+        log.info("Client client.%s config is %s" % (self.client_id, self.client_config))
+
+        daemon_signal = 'kill'
+        if self.client_config.get('coverage') or self.client_config.get('valgrind') is not None:
+            daemon_signal = 'term'
+
+        log.info('Mounting ceph-fuse client.{id} at {remote} {mnt}...'.format(
+            id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
+
+        self.client_remote.run(args=['mkdir', '-p', self.mountpoint],
+                               timeout=(15*60), cwd=self.test_dir)
+
+        run_cmd = [
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+            'daemon-helper',
+            daemon_signal,
+        ]
+
+        fuse_cmd = ['ceph-fuse', "-f"]
+
+        if mount_path is not None:
+            fuse_cmd += ["--client_mountpoint={0}".format(mount_path)]
+
+        if mount_fs_name is not None:
+            fuse_cmd += ["--client_mds_namespace={0}".format(mount_fs_name)]
+
+        fuse_cmd += [
+            '--name', 'client.{id}'.format(id=self.client_id),
+            # TODO ceph-fuse doesn't understand dash dash '--',
+            self.mountpoint,
+        ]
+
+        cwd = self.test_dir
+        if self.client_config.get('valgrind') is not None:
+            run_cmd = misc.get_valgrind_args(
+                self.test_dir,
+                'client.{id}'.format(id=self.client_id),
+                run_cmd,
+                self.client_config.get('valgrind'),
+            )
+            cwd = None # misc.get_valgrind_args chdir for us
+
+        run_cmd.extend(fuse_cmd)
+
+        def list_connections():
+            from teuthology.misc import get_system_type
+
+            conn_dir = "/sys/fs/fuse/connections"
+
+            self.client_remote.run(args=['sudo', 'modprobe', 'fuse'],
+                                   check_status=False)
+            self.client_remote.run(
+                args=["sudo", "mount", "-t", "fusectl", conn_dir, conn_dir],
+                check_status=False, timeout=(30))
+
+            try:
+                ls_str = self.client_remote.sh("ls " + conn_dir,
+                                               stdout=StringIO(),
+                                               timeout=(15*60)).strip()
+            except CommandFailedError:
+                return []
+
+            if ls_str:
+                return [int(n) for n in ls_str.split("\n")]
+            else:
+                return []
+
+        # Before starting ceph-fuse process, note the contents of
+        # /sys/fs/fuse/connections
+        pre_mount_conns = list_connections()
+        log.info("Pre-mount connections: {0}".format(pre_mount_conns))
+
+        proc = self.client_remote.run(
+            args=run_cmd,
+            cwd=cwd,
+            logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)),
+            stdin=run.PIPE,
+            wait=False,
+        )
+        self.fuse_daemon = proc
+
+        # Wait for the connection reference to appear in /sys
+        mount_wait = self.client_config.get('mount_wait', 0)
+        if mount_wait > 0:
+            log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait))
+            time.sleep(mount_wait)            
+        timeout = int(self.client_config.get('mount_timeout', 30))
+        waited = 0
+
+        post_mount_conns = list_connections()
+        while len(post_mount_conns) <= len(pre_mount_conns):
+            if self.fuse_daemon.finished:
+                # Did mount fail?  Raise the CommandFailedError instead of
+                # hitting the "failed to populate /sys/" timeout
+                self.fuse_daemon.wait()
+            time.sleep(1)
+            waited += 1
+            if waited > timeout:
+                raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
+                    waited
+                ))
+            else:
+                post_mount_conns = list_connections()
+
+        log.info("Post-mount connections: {0}".format(post_mount_conns))
+
+        # Record our fuse connection number so that we can use it when
+        # forcing an unmount
+        new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
+        if len(new_conns) == 0:
+            raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
+        elif len(new_conns) > 1:
+            raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
+        else:
+            self._fuse_conn = new_conns[0]
+
+        self.gather_mount_info()
+
+    def gather_mount_info(self):
+        status = self.admin_socket(['status'])
+        self.id = status['id']
+        self.client_pid = status['metadata']['pid']
+        try:
+            self.inst = status['inst_str']
+            self.addr = status['addr_str']
+        except KeyError:
+            sessions = self.fs.rank_asok(['session', 'ls'])
+            for s in sessions:
+                if s['id'] == self.id:
+                    self.inst = s['inst']
+                    self.addr = self.inst.split()[1]
+            if self.inst is None:
+                raise RuntimeError("cannot find client session")
+
+    def is_mounted(self):
+        proc = self.client_remote.run(
+            args=[
+                'stat',
+                '--file-system',
+                '--printf=%T\n',
+                '--',
+                self.mountpoint,
+            ],
+            cwd=self.test_dir,
+            stdout=StringIO(),
+            stderr=StringIO(),
+            wait=False,
+            timeout=(15*60)
+        )
+        try:
+            proc.wait()
+        except CommandFailedError:
+            error = proc.stderr.getvalue()
+            if ("endpoint is not connected" in error
+            or "Software caused connection abort" in error):
+                # This happens is fuse is killed without unmount
+                log.warning("Found stale moutn point at {0}".format(self.mountpoint))
+                return True
+            else:
+                # This happens if the mount directory doesn't exist
+                log.info('mount point does not exist: %s', self.mountpoint)
+                return False
+
+        fstype = six.ensure_str(proc.stdout.getvalue()).rstrip('\n')
+        if fstype == 'fuseblk':
+            log.info('ceph-fuse is mounted on %s', self.mountpoint)
+            return True
+        else:
+            log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format(
+                fstype=fstype))
+            return False
+
+    def wait_until_mounted(self):
+        """
+        Check to make sure that fuse is mounted on mountpoint.  If not,
+        sleep for 5 seconds and check again.
+        """
+
+        while not self.is_mounted():
+            # Even if it's not mounted, it should at least
+            # be running: catch simple failures where it has terminated.
+            assert not self.fuse_daemon.poll()
+
+            time.sleep(5)
+
+        # Now that we're mounted, set permissions so that the rest of the test will have
+        # unrestricted access to the filesystem mount.
+        try:
+            stderr = StringIO()
+            self.client_remote.run(args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(15*60), cwd=self.test_dir, stderr=stderr)
+        except run.CommandFailedError:
+            stderr = stderr.getvalue()
+            if "Read-only file system".lower() in stderr.lower():
+                pass
+            else:
+                raise
+
+    def _mountpoint_exists(self):
+        return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False, cwd=self.test_dir, timeout=(15*60)).exitstatus == 0
+
+    def umount(self):
+        if not self.is_mounted():
+            return
+
+        try:
+            log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name))
+            self.client_remote.run(
+                args=[
+                    'sudo',
+                    'fusermount',
+                    '-u',
+                    self.mountpoint,
+                ],
+                cwd=self.test_dir,
+                timeout=(30*60),
+            )
+        except run.CommandFailedError:
+            log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name))
+
+            self.client_remote.run(args=[
+                'sudo',
+                run.Raw('PATH=/usr/sbin:$PATH'),
+                'lsof',
+                run.Raw(';'),
+                'ps',
+                'auxf',
+            ], timeout=(60*15))
+
+            # abort the fuse mount, killing all hung processes
+            if self._fuse_conn:
+                self.run_python(dedent("""
+                import os
+                path = "/sys/fs/fuse/connections/{0}/abort"
+                if os.path.exists(path):
+                    open(path, "w").write("1")
+                """).format(self._fuse_conn))
+                self._fuse_conn = None
+
+            stderr = StringIO()
+            try:
+                # make sure its unmounted
+                self.client_remote.run(
+                    args=[
+                        'sudo',
+                        'umount',
+                        '-l',
+                        '-f',
+                        self.mountpoint,
+                    ],
+                    stderr=stderr,
+                    timeout=(60*15)
+                )
+            except CommandFailedError:
+                if self.is_mounted():
+                    raise
+
+        assert not self.is_mounted()
+        self._fuse_conn = None
+        self.id = None
+        self.inst = None
+        self.addr = None
+
+    def umount_wait(self, force=False, require_clean=False, timeout=900):
+        """
+        :param force: Complete cleanly even if the MDS is offline
+        """
+        if force:
+            assert not require_clean  # mutually exclusive
+
+            # When we expect to be forcing, kill the ceph-fuse process directly.
+            # This should avoid hitting the more aggressive fallback killing
+            # in umount() which can affect other mounts too.
+            self.fuse_daemon.stdin.close()
+
+            # However, we will still hit the aggressive wait if there is an ongoing
+            # mount -o remount (especially if the remount is stuck because MDSs
+            # are unavailable)
+
+        self.umount()
+
+        try:
+            if self.fuse_daemon:
+                # Permit a timeout, so that we do not block forever
+                run.wait([self.fuse_daemon], timeout)
+        except MaxWhileTries:
+            log.error("process failed to terminate after unmount. This probably"
+                      " indicates a bug within ceph-fuse.")
+            raise
+        except CommandFailedError:
+            if require_clean:
+                raise
+
+        self.cleanup()
+
+    def cleanup(self):
+        """
+        Remove the mount point.
+
+        Prerequisite: the client is not mounted.
+        """
+        stderr = StringIO()
+        try:
+            self.client_remote.run(
+                args=[
+                    'rmdir',
+                    '--',
+                    self.mountpoint,
+                ],
+                cwd=self.test_dir,
+                stderr=stderr,
+                timeout=(60*5),
+                check_status=False,
+            )
+        except CommandFailedError:
+            if "No such file or directory" in stderr.getvalue():
+                pass
+            else:
+                raise
+
+    def kill(self):
+        """
+        Terminate the client without removing the mount point.
+        """
+        log.info('Killing ceph-fuse connection on {name}...'.format(name=self.client_remote.name))
+        self.fuse_daemon.stdin.close()
+        try:
+            self.fuse_daemon.wait()
+        except CommandFailedError:
+            pass
+
+    def kill_cleanup(self):
+        """
+        Follow up ``kill`` to get to a clean unmounted state.
+        """
+        log.info('Cleaning up killed ceph-fuse connection')
+        self.umount()
+        self.cleanup()
+
+    def teardown(self):
+        """
+        Whatever the state of the mount, get it gone.
+        """
+        super(FuseMount, self).teardown()
+
+        self.umount()
+
+        if self.fuse_daemon and not self.fuse_daemon.finished:
+            self.fuse_daemon.stdin.close()
+            try:
+                self.fuse_daemon.wait()
+            except CommandFailedError:
+                pass
+
+        # Indiscriminate, unlike the touchier cleanup()
+        self.client_remote.run(
+            args=[
+                'rm',
+                '-rf',
+                self.mountpoint,
+            ],
+            cwd=self.test_dir,
+            timeout=(60*5)
+        )
+
+    def _asok_path(self):
+        return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id)
+
+    @property
+    def _prefix(self):
+        return ""
+
+    def admin_socket(self, args):
+        pyscript = """
+import glob
+import re
+import os
+import subprocess
+
+def find_socket(client_name):
+        asok_path = "{asok_path}"
+        files = glob.glob(asok_path)
+
+        # Given a non-glob path, it better be there
+        if "*" not in asok_path:
+            assert(len(files) == 1)
+            return files[0]
+
+        for f in files:
+                pid = re.match(".*\.(\d+)\.asok$", f).group(1)
+                if os.path.exists("/proc/{{0}}".format(pid)):
+                        return f
+        raise RuntimeError("Client socket {{0}} not found".format(client_name))
+
+print(find_socket("{client_name}"))
+""".format(
+            asok_path=self._asok_path(),
+            client_name="client.{0}".format(self.client_id))
+
+        # Find the admin socket
+        asok_path = self.client_remote.sh(
+            ['sudo', 'python3', '-c', pyscript],
+            stdout=StringIO(),
+            timeout=(15*60)).strip()
+        log.info("Found client admin socket at {0}".format(asok_path))
+
+        # Query client ID from admin socket
+        json_data = self.client_remote.sh(
+            ['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args,
+            stdout=StringIO(),
+            timeout=(15*60))
+        return json.loads(json_data)
+
+    def get_global_id(self):
+        """
+        Look up the CephFS client ID for this mount
+        """
+        return self.admin_socket(['mds_sessions'])['id']
+
+    def get_global_inst(self):
+        """
+        Look up the CephFS client instance for this mount
+        """
+        return self.inst
+
+    def get_global_addr(self):
+        """
+        Look up the CephFS client addr for this mount
+        """
+        return self.addr
+
+    def get_client_pid(self):
+        """
+        return pid of ceph-fuse process
+        """
+        status = self.admin_socket(['status'])
+        return status['metadata']['pid']
+
+    def get_osd_epoch(self):
+        """
+        Return 2-tuple of osd_epoch, osd_epoch_barrier
+        """
+        status = self.admin_socket(['status'])
+        return status['osd_epoch'], status['osd_epoch_barrier']
+
+    def get_dentry_count(self):
+        """
+        Return 2-tuple of dentry_count, dentry_pinned_count
+        """
+        status = self.admin_socket(['status'])
+        return status['dentry_count'], status['dentry_pinned_count']
+
+    def set_cache_size(self, size):
+        return self.admin_socket(['config', 'set', 'client_cache_size', str(size)])
diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py
new file mode 100644
index 00000000..d027bcfc
--- /dev/null
+++ b/qa/tasks/cephfs/kernel_mount.py
@@ -0,0 +1,260 @@
+import json
+import logging
+import time
+from textwrap import dedent
+from teuthology.orchestra.run import CommandFailedError
+from teuthology import misc
+
+from teuthology.orchestra import remote as orchestra_remote
+from teuthology.orchestra import run
+from teuthology.contextutil import MaxWhileTries
+from tasks.cephfs.mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+
+UMOUNT_TIMEOUT = 300
+
+
+class KernelMount(CephFSMount):
+    def __init__(self, ctx, test_dir, client_id, client_remote,
+                 ipmi_user, ipmi_password, ipmi_domain):
+        super(KernelMount, self).__init__(ctx, test_dir, client_id, client_remote)
+
+        self.mounted = False
+        self.ipmi_user = ipmi_user
+        self.ipmi_password = ipmi_password
+        self.ipmi_domain = ipmi_domain
+
+    def mount(self, mount_path=None, mount_fs_name=None, mountpoint=None):
+        if mountpoint is not None:
+            self.mountpoint = mountpoint
+        self.setupfs(name=mount_fs_name)
+
+        log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format(
+            id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
+
+        self.client_remote.run(args=['mkdir', '-p', self.mountpoint],
+                               timeout=(5*60))
+
+        if mount_path is None:
+            mount_path = "/"
+
+        opts = 'name={id},norequire_active_mds,conf={conf}'.format(id=self.client_id,
+                                                        conf=self.config_path)
+
+        if mount_fs_name is not None:
+            opts += ",mds_namespace={0}".format(mount_fs_name)
+
+        self.client_remote.run(
+            args=[
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+                '/bin/mount',
+                '-t',
+                'ceph',
+                ':{mount_path}'.format(mount_path=mount_path),
+                self.mountpoint,
+                '-v',
+                '-o',
+                opts
+            ],
+            timeout=(30*60),
+        )
+
+        self.client_remote.run(
+            args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(5*60))
+
+        self.mounted = True
+
+    def umount(self, force=False):
+        if not self.is_mounted():
+            return
+
+        log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
+
+        cmd=['sudo', 'umount', self.mountpoint]
+        if force:
+            cmd.append('-f')
+
+        try:
+            self.client_remote.run(args=cmd, timeout=(15*60))
+        except Exception as e:
+            self.client_remote.run(args=[
+                'sudo',
+                run.Raw('PATH=/usr/sbin:$PATH'),
+                'lsof',
+                run.Raw(';'),
+                'ps', 'auxf',
+            ], timeout=(15*60))
+            raise e
+
+        rproc = self.client_remote.run(
+            args=[
+                'rmdir',
+                '--',
+                self.mountpoint,
+            ],
+            wait=False
+        )
+        run.wait([rproc], UMOUNT_TIMEOUT)
+        self.mounted = False
+
+    def cleanup(self):
+        pass
+
+    def umount_wait(self, force=False, require_clean=False, timeout=900):
+        """
+        Unlike the fuse client, the kernel client's umount is immediate
+        """
+        if not self.is_mounted():
+            return
+
+        try:
+            self.umount(force)
+        except (CommandFailedError, MaxWhileTries):
+            if not force:
+                raise
+
+            self.kill()
+            self.kill_cleanup()
+
+        self.mounted = False
+
+    def is_mounted(self):
+        return self.mounted
+
+    def wait_until_mounted(self):
+        """
+        Unlike the fuse client, the kernel client is up and running as soon
+        as the initial mount() function returns.
+        """
+        assert self.mounted
+
+    def teardown(self):
+        super(KernelMount, self).teardown()
+        if self.mounted:
+            self.umount()
+
+    def kill(self):
+        """
+        The Ceph kernel client doesn't have a mechanism to kill itself (doing
+        that in side the kernel would be weird anyway), so we reboot the whole node
+        to get the same effect.
+
+        We use IPMI to reboot, because we don't want the client to send any
+        releases of capabilities.
+        """
+
+        con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
+                                                self.ipmi_user,
+                                                self.ipmi_password,
+                                                self.ipmi_domain)
+        con.hard_reset(wait_for_login=False)
+
+        self.mounted = False
+
+    def kill_cleanup(self):
+        assert not self.mounted
+
+        # We need to do a sleep here because we don't know how long it will
+        # take for a hard_reset to be effected.
+        time.sleep(30)
+
+        try:
+            # Wait for node to come back up after reboot
+            misc.reconnect(None, 300, [self.client_remote])
+        except:
+            # attempt to get some useful debug output:
+            con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
+                                                    self.ipmi_user,
+                                                    self.ipmi_password,
+                                                    self.ipmi_domain)
+            con.check_status(timeout=60)
+            raise
+
+        # Remove mount directory
+        self.client_remote.run(args=['uptime'], timeout=10)
+
+        # Remove mount directory
+        self.client_remote.run(
+            args=[
+                'rmdir',
+                '--',
+                self.mountpoint,
+            ],
+            timeout=(5*60),
+            check_status=False,
+        )
+
+    def _find_debug_dir(self):
+        """
+        Find the debugfs folder for this mount
+        """
+        pyscript = dedent("""
+            import glob
+            import os
+            import json
+
+            def get_id_to_dir():
+                result = {}
+                for dir in glob.glob("/sys/kernel/debug/ceph/*"):
+                    mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines()
+                    client_id = mds_sessions_lines[1].split()[1].strip('"')
+
+                    result[client_id] = dir
+                return result
+
+            print(json.dumps(get_id_to_dir()))
+            """)
+
+        output = self.client_remote.sh([
+            'sudo', 'python3', '-c', pyscript
+        ], timeout=(5*60))
+        client_id_to_dir = json.loads(output)
+
+        try:
+            return client_id_to_dir[self.client_id]
+        except KeyError:
+            log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format(
+                self.client_id, ",".join(client_id_to_dir.keys())
+            ))
+            raise
+
+    def _read_debug_file(self, filename):
+        debug_dir = self._find_debug_dir()
+
+        pyscript = dedent("""
+            import os
+
+            print(open(os.path.join("{debug_dir}", "{filename}")).read())
+            """).format(debug_dir=debug_dir, filename=filename)
+
+        output = self.client_remote.sh([
+            'sudo', 'python3', '-c', pyscript
+        ], timeout=(5*60))
+        return output
+
+    def get_global_id(self):
+        """
+        Look up the CephFS client ID for this mount, using debugfs.
+        """
+
+        assert self.mounted
+
+        mds_sessions = self._read_debug_file("mds_sessions")
+        lines = mds_sessions.split("\n")
+        return int(lines[0].split()[1])
+
+    def get_osd_epoch(self):
+        """
+        Return 2-tuple of osd_epoch, osd_epoch_barrier
+        """
+        osd_map = self._read_debug_file("osdmap")
+        lines = osd_map.split("\n")
+        first_line_tokens = lines[0].split()
+        epoch, barrier = int(first_line_tokens[1]), int(first_line_tokens[3])
+
+        return epoch, barrier
diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py
new file mode 100644
index 00000000..d486f1b6
--- /dev/null
+++ b/qa/tasks/cephfs/mount.py
@@ -0,0 +1,728 @@
+from contextlib import contextmanager
+from io import BytesIO
+import json
+import logging
+import datetime
+import six
+import time
+from six import StringIO
+from textwrap import dedent
+import os
+from teuthology.misc import sudo_write_file
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError, Raw
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+
+class CephFSMount(object):
+    def __init__(self, ctx, test_dir, client_id, client_remote):
+        """
+        :param test_dir: Global teuthology test dir
+        :param client_id: Client ID, the 'foo' in client.foo
+        :param client_remote: Remote instance for the host where client will run
+        """
+
+        self.ctx = ctx
+        self.test_dir = test_dir
+        self.client_id = client_id
+        self.client_remote = client_remote
+        self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id)
+        self._mountpoint = None
+        self.fs = None
+
+        self.test_files = ['a', 'b', 'c']
+
+        self.background_procs = []
+
+    @property
+    def mountpoint(self):
+        if self._mountpoint == None:
+            self._mountpoint= os.path.join(
+                self.test_dir, '{dir_name}'.format(dir_name=self.mountpoint_dir_name))
+        return self._mountpoint
+
+    @mountpoint.setter
+    def mountpoint(self, path):
+        if not isinstance(path, str):
+            raise RuntimeError('path should be of str type.')
+        self._mountpoint = path
+
+    def is_mounted(self):
+        raise NotImplementedError()
+
+    def setupfs(self, name=None):
+        if name is None and self.fs is not None:
+            # Previous mount existed, reuse the old name
+            name = self.fs.name
+        self.fs = Filesystem(self.ctx, name=name)
+        log.info('Wait for MDS to reach steady state...')
+        self.fs.wait_for_daemons()
+        log.info('Ready to start {}...'.format(type(self).__name__))
+
+    def mount(self, mount_path=None, mount_fs_name=None, mountpoint=None):
+        raise NotImplementedError()
+
+    def umount(self):
+        raise NotImplementedError()
+
+    def umount_wait(self, force=False, require_clean=False):
+        """
+
+        :param force: Expect that the mount will not shutdown cleanly: kill
+                      it hard.
+        :param require_clean: Wait for the Ceph client associated with the
+                              mount (e.g. ceph-fuse) to terminate, and
+                              raise if it doesn't do so cleanly.
+        :return:
+        """
+        raise NotImplementedError()
+
+    def kill_cleanup(self):
+        raise NotImplementedError()
+
+    def kill(self):
+        raise NotImplementedError()
+
+    def cleanup(self):
+        raise NotImplementedError()
+
+    def wait_until_mounted(self):
+        raise NotImplementedError()
+
+    def get_keyring_path(self):
+        return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)
+
+    @property
+    def config_path(self):
+        """
+        Path to ceph.conf: override this if you're not a normal systemwide ceph install
+        :return: stringv
+        """
+        return "/etc/ceph/ceph.conf"
+
+    @contextmanager
+    def mounted(self):
+        """
+        A context manager, from an initially unmounted state, to mount
+        this, yield, and then unmount and clean up.
+        """
+        self.mount()
+        self.wait_until_mounted()
+        try:
+            yield
+        finally:
+            self.umount_wait()
+
+    def is_blacklisted(self):
+        addr = self.get_global_addr()
+        blacklist = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "blacklist", "ls", "--format=json"))
+        for b in blacklist:
+            if addr == b["addr"]:
+                return True
+        return False
+
+    def create_files(self):
+        assert(self.is_mounted())
+
+        for suffix in self.test_files:
+            log.info("Creating file {0}".format(suffix))
+            self.client_remote.run(args=[
+                'sudo', 'touch', os.path.join(self.mountpoint, suffix)
+            ])
+
+    def check_files(self):
+        assert(self.is_mounted())
+
+        for suffix in self.test_files:
+            log.info("Checking file {0}".format(suffix))
+            r = self.client_remote.run(args=[
+                'sudo', 'ls', os.path.join(self.mountpoint, suffix)
+            ], check_status=False)
+            if r.exitstatus != 0:
+                raise RuntimeError("Expected file {0} not found".format(suffix))
+
+    def write_file(self, path, data, perms=None):
+        """
+        Write the given data at the given path and set the given perms to the
+        file on the path.
+        """
+        if path.find(self.mountpoint) == -1:
+            path = os.path.join(self.mountpoint, path)
+
+        sudo_write_file(self.client_remote, path, data)
+
+        if perms:
+            self.run_shell(args=f'chmod {perms} {path}')
+
+    def read_file(self, path):
+        """
+        Return the data from the file on given path.
+        """
+        if path.find(self.mountpoint) == -1:
+            path = os.path.join(self.mountpoint, path)
+
+        return self.run_shell(args=['sudo', 'cat', path], omit_sudo=False).\
+            stdout.getvalue().strip()
+
+    def create_destroy(self):
+        assert(self.is_mounted())
+
+        filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
+        log.debug("Creating test file {0}".format(filename))
+        self.client_remote.run(args=[
+            'sudo', 'touch', os.path.join(self.mountpoint, filename)
+        ])
+        log.debug("Deleting test file {0}".format(filename))
+        self.client_remote.run(args=[
+            'sudo', 'rm', '-f', os.path.join(self.mountpoint, filename)
+        ])
+
+    def _run_python(self, pyscript, py_version='python3'):
+        return self.client_remote.run(
+               args=['sudo', 'adjust-ulimits', 'daemon-helper', 'kill',
+                     py_version, '-c', pyscript], wait=False, stdin=run.PIPE,
+               stdout=StringIO())
+
+    def run_python(self, pyscript, py_version='python3'):
+        p = self._run_python(pyscript, py_version)
+        p.wait()
+        return six.ensure_str(p.stdout.getvalue().strip())
+
+    def run_shell(self, args, wait=True, check_status=True, omit_sudo=True):
+        if isinstance(args, str):
+            args = args.split()
+
+        args = ["cd", self.mountpoint, run.Raw('&&'), "sudo"] + args
+        return self.client_remote.run(args=args, stdout=StringIO(),
+                                      stderr=StringIO(), wait=wait,
+                                      check_status=check_status,
+                                      omit_sudo=omit_sudo)
+
+    def run_shell_payload(self, payload, **kwargs):
+        return self.run_shell(["bash", "-c", Raw(f"'{payload}'")], **kwargs)
+
+    def open_no_data(self, basename):
+        """
+        A pure metadata operation
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        p = self._run_python(dedent(
+            """
+            f = open("{path}", 'w')
+            """.format(path=path)
+        ))
+        p.wait()
+
+    def open_background(self, basename="background_file", write=True):
+        """
+        Open a file for writing, then block such that the client
+        will hold a capability.
+
+        Don't return until the remote process has got as far as opening
+        the file, then return the RemoteProcess instance.
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        if write:
+            pyscript = dedent("""
+                import time
+
+                with open("{path}", 'w') as f:
+                    f.write('content')
+                    f.flush()
+                    f.write('content2')
+                    while True:
+                        time.sleep(1)
+                """).format(path=path)
+        else:
+            pyscript = dedent("""
+                import time
+
+                with open("{path}", 'r') as f:
+                    while True:
+                        time.sleep(1)
+                """).format(path=path)
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+
+        # This wait would not be sufficient if the file had already
+        # existed, but it's simple and in practice users of open_background
+        # are not using it on existing files.
+        self.wait_for_visible(basename)
+
+        return rproc
+
+    def wait_for_dir_empty(self, dirname, timeout=30):
+        i = 0
+        dirpath = os.path.join(self.mountpoint, dirname)
+        while i < timeout:
+            nr_entries = int(self.getfattr(dirpath, "ceph.dir.entries"))
+            if nr_entries == 0:
+                log.debug("Directory {0} seen empty from {1} after {2}s ".format(
+                    dirname, self.client_id, i))
+                return
+            else:
+                time.sleep(1)
+                i += 1
+
+        raise RuntimeError("Timed out after {0}s waiting for {1} to become empty from {2}".format(
+            i, dirname, self.client_id))
+
+    def wait_for_visible(self, basename="background_file", timeout=30):
+        i = 0
+        while i < timeout:
+            r = self.client_remote.run(args=[
+                'sudo', 'ls', os.path.join(self.mountpoint, basename)
+            ], check_status=False)
+            if r.exitstatus == 0:
+                log.debug("File {0} became visible from {1} after {2}s".format(
+                    basename, self.client_id, i))
+                return
+            else:
+                time.sleep(1)
+                i += 1
+
+        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
+            i, basename, self.client_id))
+
+    def lock_background(self, basename="background_file", do_flock=True):
+        """
+        Open and lock a files for writing, hold the lock in a background process
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script_builder = """
+            import time
+            import fcntl
+            import struct"""
+        if do_flock:
+            script_builder += """
+            f1 = open("{path}-1", 'w')
+            fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
+        script_builder += """
+            f2 = open("{path}-2", 'w')
+            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            while True:
+                time.sleep(1)
+            """
+
+        pyscript = dedent(script_builder).format(path=path)
+
+        log.info("lock_background file {0}".format(basename))
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def lock_and_release(self, basename="background_file"):
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script = """
+            import time
+            import fcntl
+            import struct
+            f1 = open("{path}-1", 'w')
+            fcntl.flock(f1, fcntl.LOCK_EX)
+            f2 = open("{path}-2", 'w')
+            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            """
+        pyscript = dedent(script).format(path=path)
+
+        log.info("lock_and_release file {0}".format(basename))
+        return self._run_python(pyscript)
+
+    def check_filelock(self, basename="background_file", do_flock=True):
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script_builder = """
+            import fcntl
+            import errno
+            import struct"""
+        if do_flock:
+            script_builder += """
+            f1 = open("{path}-1", 'r')
+            try:
+                fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except IOError as e:
+                if e.errno == errno.EAGAIN:
+                    pass
+            else:
+                raise RuntimeError("flock on file {path}-1 not found")"""
+        script_builder += """
+            f2 = open("{path}-2", 'r')
+            try:
+                lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+                fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            except IOError as e:
+                if e.errno == errno.EAGAIN:
+                    pass
+            else:
+                raise RuntimeError("posix lock on file {path}-2 not found")
+            """
+        pyscript = dedent(script_builder).format(path=path)
+
+        log.info("check lock on file {0}".format(basename))
+        self.client_remote.run(args=[
+            'sudo', 'python3', '-c', pyscript
+        ])
+
+    def write_background(self, basename="background_file", loop=False):
+        """
+        Open a file for writing, complete as soon as you can
+        :param basename:
+        :return:
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        pyscript = dedent("""
+            import os
+            import time
+
+            fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0o644)
+            try:
+                while True:
+                    os.write(fd, b'content')
+                    time.sleep(1)
+                    if not {loop}:
+                        break
+            except IOError as e:
+                pass
+            os.close(fd)
+            """).format(path=path, loop=str(loop))
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def write_n_mb(self, filename, n_mb, seek=0, wait=True):
+        """
+        Write the requested number of megabytes to a file
+        """
+        assert(self.is_mounted())
+
+        return self.run_shell(["dd", "if=/dev/urandom", "of={0}".format(filename),
+                               "bs=1M", "conv=fdatasync",
+                               "count={0}".format(int(n_mb)),
+                               "seek={0}".format(int(seek))
+                               ], wait=wait)
+
+    def write_test_pattern(self, filename, size):
+        log.info("Writing {0} bytes to {1}".format(size, filename))
+        return self.run_python(dedent("""
+            import zlib
+            path = "{path}"
+            with open(path, 'w') as f:
+                for i in range(0, {size}):
+                    val = zlib.crc32(str(i).encode('utf-8')) & 7
+                    f.write(chr(val))
+        """.format(
+            path=os.path.join(self.mountpoint, filename),
+            size=size
+        )))
+
+    def validate_test_pattern(self, filename, size):
+        log.info("Validating {0} bytes from {1}".format(size, filename))
+        return self.run_python(dedent("""
+            import zlib
+            path = "{path}"
+            with open(path, 'r') as f:
+                bytes = f.read()
+            if len(bytes) != {size}:
+                raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
+                    len(bytes), {size}
+                ))
+            for i, b in enumerate(bytes):
+                val = zlib.crc32(str(i).encode('utf-8')) & 7
+                if b != chr(val):
+                    raise RuntimeError("Bad data at offset {{0}}".format(i))
+        """.format(
+            path=os.path.join(self.mountpoint, filename),
+            size=size
+        )))
+
+    def open_n_background(self, fs_path, count):
+        """
+        Open N files for writing, hold them open in a background process
+
+        :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
+        :return: a RemoteProcess
+        """
+        assert(self.is_mounted())
+
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        pyscript = dedent("""
+            import sys
+            import time
+            import os
+
+            n = {count}
+            abs_path = "{abs_path}"
+
+            if not os.path.exists(abs_path):
+                os.makedirs(abs_path)
+
+            handles = []
+            for i in range(0, n):
+                fname = "file_"+str(i)
+                path = os.path.join(abs_path, fname)
+                handles.append(open(path, 'w'))
+
+            while True:
+                time.sleep(1)
+            """).format(abs_path=abs_path, count=count)
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def create_n_files(self, fs_path, count, sync=False):
+        assert(self.is_mounted())
+
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        pyscript = dedent("""
+            import sys
+            import time
+            import os
+
+            n = {count}
+            abs_path = "{abs_path}"
+
+            if not os.path.exists(os.path.dirname(abs_path)):
+                os.makedirs(os.path.dirname(abs_path))
+
+            for i in range(0, n):
+                fname = "{{0}}_{{1}}".format(abs_path, i)
+                with open(fname, 'w') as f:
+                    f.write('content')
+                    if {sync}:
+                        f.flush()
+                        os.fsync(f.fileno())
+            """).format(abs_path=abs_path, count=count, sync=str(sync))
+
+        self.run_python(pyscript)
+
+    def teardown(self):
+        for p in self.background_procs:
+            log.info("Terminating background process")
+            self._kill_background(p)
+
+        self.background_procs = []
+
+    def _kill_background(self, p):
+        if p.stdin:
+            p.stdin.close()
+            try:
+                p.wait()
+            except (CommandFailedError, ConnectionLostError):
+                pass
+
+    def kill_background(self, p):
+        """
+        For a process that was returned by one of the _background member functions,
+        kill it hard.
+        """
+        self._kill_background(p)
+        self.background_procs.remove(p)
+
+    def send_signal(self, signal):
+        signal = signal.lower()
+        if signal.lower() not in ['sigstop', 'sigcont', 'sigterm', 'sigkill']:
+            raise NotImplementedError
+
+        self.client_remote.run(args=['sudo', 'kill', '-{0}'.format(signal),
+                                self.client_pid], omit_sudo=False)
+
+    def get_global_id(self):
+        raise NotImplementedError()
+
+    def get_global_inst(self):
+        raise NotImplementedError()
+
+    def get_global_addr(self):
+        raise NotImplementedError()
+
+    def get_osd_epoch(self):
+        raise NotImplementedError()
+
+    def stat(self, fs_path, wait=True):
+        """
+        stat a file, and return the result as a dictionary like this:
+        {
+          "st_ctime": 1414161137.0,
+          "st_mtime": 1414161137.0,
+          "st_nlink": 33,
+          "st_gid": 0,
+          "st_dev": 16777218,
+          "st_size": 1190,
+          "st_ino": 2,
+          "st_uid": 0,
+          "st_mode": 16877,
+          "st_atime": 1431520593.0
+        }
+
+        Raises exception on absent file.
+        """
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        pyscript = dedent("""
+            import os
+            import stat
+            import json
+            import sys
+
+            try:
+                s = os.stat("{path}")
+            except OSError as e:
+                sys.exit(e.errno)
+
+            attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
+            print(json.dumps(
+                dict([(a, getattr(s, a)) for a in attrs]),
+                indent=2))
+            """).format(path=abs_path)
+        proc = self._run_python(pyscript)
+        if wait:
+            proc.wait()
+            return json.loads(proc.stdout.getvalue().strip())
+        else:
+            return proc
+
+    def touch(self, fs_path):
+        """
+        Create a dentry if it doesn't already exist.  This python
+        implementation exists because the usual command line tool doesn't
+        pass through error codes like EIO.
+
+        :param fs_path:
+        :return:
+        """
+        abs_path = os.path.join(self.mountpoint, fs_path)
+        pyscript = dedent("""
+            import sys
+            import errno
+
+            try:
+                f = open("{path}", "w")
+                f.close()
+            except IOError as e:
+                sys.exit(errno.EIO)
+            """).format(path=abs_path)
+        proc = self._run_python(pyscript)
+        proc.wait()
+
+    def path_to_ino(self, fs_path, follow_symlinks=True):
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        if follow_symlinks:
+            pyscript = dedent("""
+                import os
+                import stat
+
+                print(os.stat("{path}").st_ino)
+                """).format(path=abs_path)
+        else:
+            pyscript = dedent("""
+                import os
+                import stat
+
+                print(os.lstat("{path}").st_ino)
+                """).format(path=abs_path)
+
+        proc = self._run_python(pyscript)
+        proc.wait()
+        return int(proc.stdout.getvalue().strip())
+
+    def path_to_nlink(self, fs_path):
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        pyscript = dedent("""
+            import os
+            import stat
+
+            print(os.stat("{path}").st_nlink)
+            """).format(path=abs_path)
+
+        proc = self._run_python(pyscript)
+        proc.wait()
+        return int(proc.stdout.getvalue().strip())
+
+    def ls(self, path=None):
+        """
+        Wrap ls: return a list of strings
+        """
+        cmd = ["ls"]
+        if path:
+            cmd.append(path)
+
+        ls_text = self.run_shell(cmd).stdout.getvalue().strip()
+
+        if ls_text:
+            return ls_text.split("\n")
+        else:
+            # Special case because otherwise split on empty string
+            # gives you [''] instead of []
+            return []
+
+    def setfattr(self, path, key, val):
+        """
+        Wrap setfattr.
+
+        :param path: relative to mount point
+        :param key: xattr name
+        :param val: xattr value
+        :return: None
+        """
+        self.run_shell(["setfattr", "-n", key, "-v", val, path])
+
+    def getfattr(self, path, attr):
+        """
+        Wrap getfattr: return the values of a named xattr on one file, or
+        None if the attribute is not found.
+
+        :return: a string
+        """
+        p = self.run_shell(["getfattr", "--only-values", "-n", attr, path], wait=False)
+        try:
+            p.wait()
+        except CommandFailedError as e:
+            if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue():
+                return None
+            else:
+                raise
+
+        return str(p.stdout.getvalue())
+
+    def df(self):
+        """
+        Wrap df: return a dict of usage fields in bytes
+        """
+
+        p = self.run_shell(["df", "-B1", "."])
+        lines = p.stdout.getvalue().strip().split("\n")
+        fs, total, used, avail = lines[1].split()[:4]
+        log.warning(lines)
+
+        return {
+            "total": int(total),
+            "used": int(used),
+            "available": int(avail)
+        }
diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py
new file mode 100644
index 00000000..e4ce5570
--- /dev/null
+++ b/qa/tasks/cephfs/test_admin.py
@@ -0,0 +1,229 @@
+import json
+
+from teuthology.orchestra.run import CommandFailedError
+
+from unittest import case
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+
+from tasks.cephfs.filesystem import FileLayout
+
+class TestAdminCommands(CephFSTestCase):
+    """
+    Tests for administration command.
+    """
+
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def test_fs_status(self):
+        """
+        That `ceph fs status` command functions.
+        """
+
+        s = self.fs.mon_manager.raw_cluster_cmd("fs", "status")
+        self.assertTrue("active" in s)
+
+    def _setup_ec_pools(self, n, metadata=True, overwrites=True):
+        if metadata:
+            self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', n+"-meta", "8")
+        cmd = ['osd', 'erasure-code-profile', 'set', n+"-profile", "m=2", "k=2", "crush-failure-domain=osd"]
+        self.fs.mon_manager.raw_cluster_cmd(*cmd)
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', n+"-data", "8", "erasure", n+"-profile")
+        if overwrites:
+            self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true')
+
+    def _check_pool_application_metadata_key_value(self, pool, app, key, value):
+        output = self.fs.mon_manager.raw_cluster_cmd(
+            'osd', 'pool', 'application', 'get', pool, app, key)
+        self.assertEqual(str(output.strip()), value)
+
+    def test_add_data_pool_root(self):
+        """
+        That a new data pool can be added and used for the root directory.
+        """
+
+        p = self.fs.add_data_pool("foo")
+        self.fs.set_dir_layout(self.mount_a, ".", FileLayout(pool=p))
+
+    def test_add_data_pool_application_metadata(self):
+        """
+        That the application metadata set on a newly added data pool is as expected.
+        """
+        pool_name = "foo"
+        mon_cmd = self.fs.mon_manager.raw_cluster_cmd
+        mon_cmd('osd', 'pool', 'create', pool_name, str(self.fs.pgs_per_fs_pool))
+        # Check whether https://tracker.ceph.com/issues/43061 is fixed
+        mon_cmd('osd', 'pool', 'application', 'enable', pool_name, 'cephfs')
+        self.fs.add_data_pool(pool_name, create=False)
+        self._check_pool_application_metadata_key_value(
+            pool_name, 'cephfs', 'data', self.fs.name)
+
+    def test_add_data_pool_subdir(self):
+        """
+        That a new data pool can be added and used for a sub-directory.
+        """
+
+        p = self.fs.add_data_pool("foo")
+        self.mount_a.run_shell(["mkdir", "subdir"])
+        self.fs.set_dir_layout(self.mount_a, "subdir", FileLayout(pool=p))
+
+    def test_add_data_pool_non_alphamueric_name_as_subdir(self):
+        """
+        That a new data pool with non-alphanumeric name can be added and used for a sub-directory.
+        """
+        p = self.fs.add_data_pool("I-am-data_pool00.")
+        self.mount_a.run_shell("mkdir subdir")
+        self.fs.set_dir_layout(self.mount_a, "subdir", FileLayout(pool=p))
+
+    def test_add_data_pool_ec(self):
+        """
+        That a new EC data pool can be added.
+        """
+
+        n = "test_add_data_pool_ec"
+        self._setup_ec_pools(n, metadata=False)
+        p = self.fs.add_data_pool(n+"-data", create=False)
+
+    def test_new_default_ec(self):
+        """
+        That a new file system warns/fails with an EC default data pool.
+        """
+
+        self.fs.delete_all_filesystems()
+        n = "test_new_default_ec"
+        self._setup_ec_pools(n)
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data")
+        except CommandFailedError as e:
+            if e.exitstatus == 22:
+                pass
+            else:
+                raise
+        else:
+            raise RuntimeError("expected failure")
+
+    def test_new_default_ec_force(self):
+        """
+        That a new file system succeeds with an EC default data pool with --force.
+        """
+
+        self.fs.delete_all_filesystems()
+        n = "test_new_default_ec_force"
+        self._setup_ec_pools(n)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data", "--force")
+
+    def test_new_default_ec_no_overwrite(self):
+        """
+        That a new file system fails with an EC default data pool without overwrite.
+        """
+
+        self.fs.delete_all_filesystems()
+        n = "test_new_default_ec_no_overwrite"
+        self._setup_ec_pools(n, overwrites=False)
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data")
+        except CommandFailedError as e:
+            if e.exitstatus == 22:
+                pass
+            else:
+                raise
+        else:
+            raise RuntimeError("expected failure")
+        # and even with --force !
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data", "--force")
+        except CommandFailedError as e:
+            if e.exitstatus == 22:
+                pass
+            else:
+                raise
+        else:
+            raise RuntimeError("expected failure")
+
+    def test_fs_new_pool_application_metadata(self):
+        """
+        That the application metadata set on the pools of a newly created filesystem are as expected.
+        """
+        self.fs.delete_all_filesystems()
+        fs_name = "test_fs_new_pool_application"
+        keys = ['metadata', 'data']
+        pool_names = [fs_name+'-'+key for key in keys]
+        mon_cmd = self.fs.mon_manager.raw_cluster_cmd
+        for p in pool_names:
+            mon_cmd('osd', 'pool', 'create', p, str(self.fs.pgs_per_fs_pool))
+            mon_cmd('osd', 'pool', 'application', 'enable', p, 'cephfs')
+        mon_cmd('fs', 'new', fs_name, pool_names[0], pool_names[1])
+        for i in range(2):
+            self._check_pool_application_metadata_key_value(
+                pool_names[i], 'cephfs', keys[i], fs_name)
+
+
+class TestConfigCommands(CephFSTestCase):
+    """
+    Test that daemons and clients respond to the otherwise rarely-used
+    runtime config modification operations.
+    """
+
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def test_ceph_config_show(self):
+        """
+        That I can successfully show MDS configuration.
+        """
+
+        names = self.fs.get_rank_names()
+        for n in names:
+            s = self.fs.mon_manager.raw_cluster_cmd("config", "show", "mds."+n)
+            self.assertTrue("NAME" in s)
+            self.assertTrue("mon_host" in s)
+
+    def test_client_config(self):
+        """
+        That I can successfully issue asok "config set" commands
+
+        :return:
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            raise case.SkipTest("Test only applies to FUSE clients")
+
+        test_key = "client_cache_size"
+        test_val = "123"
+        self.mount_a.admin_socket(['config', 'set', test_key, test_val])
+        out = self.mount_a.admin_socket(['config', 'get', test_key])
+        self.assertEqual(out[test_key], test_val)
+
+        self.mount_a.write_n_mb("file.bin", 1);
+
+        # Implicitly asserting that things don't have lockdep error in shutdown
+        self.mount_a.umount_wait(require_clean=True)
+        self.fs.mds_stop()
+
+    def test_mds_config_asok(self):
+        test_key = "mds_max_purge_ops"
+        test_val = "123"
+        self.fs.mds_asok(['config', 'set', test_key, test_val])
+        out = self.fs.mds_asok(['config', 'get', test_key])
+        self.assertEqual(out[test_key], test_val)
+
+        # Implicitly asserting that things don't have lockdep error in shutdown
+        self.mount_a.umount_wait(require_clean=True)
+        self.fs.mds_stop()
+
+    def test_mds_config_tell(self):
+        test_key = "mds_max_purge_ops"
+        test_val = "123"
+
+        mds_id = self.fs.get_lone_mds_id()
+        self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "injectargs",
+                                            "--{0}={1}".format(test_key, test_val))
+
+        # Read it back with asok because there is no `tell` equivalent
+        out = self.fs.mds_asok(['config', 'get', test_key])
+        self.assertEqual(out[test_key], test_val)
+
+        # Implicitly asserting that things don't have lockdep error in shutdown
+        self.mount_a.umount_wait(require_clean=True)
+        self.fs.mds_stop()
diff --git a/qa/tasks/cephfs/test_auto_repair.py b/qa/tasks/cephfs/test_auto_repair.py
new file mode 100644
index 00000000..c0aa2e4c
--- /dev/null
+++ b/qa/tasks/cephfs/test_auto_repair.py
@@ -0,0 +1,90 @@
+
+"""
+Exercise the MDS's auto repair functions
+"""
+
+import logging
+import time
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+
+class TestMDSAutoRepair(CephFSTestCase):
+    def test_backtrace_repair(self):
+        """
+        MDS should verify/fix backtrace on fetch dirfrag
+        """
+
+        self.mount_a.run_shell(["mkdir", "testdir1"])
+        self.mount_a.run_shell(["touch", "testdir1/testfile"])
+        dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino("testdir1"))
+
+        # drop inodes caps
+        self.mount_a.umount_wait()
+
+        # flush journal entries to dirfrag objects, and expire journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # Restart the MDS to drop the metadata cache (because we expired the journal,
+        # nothing gets replayed into cache on restart)
+        self.fs.mds_stop()
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # remove testdir1's backtrace
+        self.fs.rados(["rmxattr", dir_objname, "parent"])
+
+        # readdir (fetch dirfrag) should fix testdir1's backtrace
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.run_shell(["ls", "testdir1"])
+
+        # flush journal entries to dirfrag objects
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # check if backtrace exists
+        self.fs.rados(["getxattr", dir_objname, "parent"])
+
+    def test_mds_readonly(self):
+        """
+        test if MDS behave correct when it's readonly
+        """
+        # operation should successd when MDS is not readonly
+        self.mount_a.run_shell(["touch", "test_file1"])
+        writer = self.mount_a.write_background(loop=True)
+
+        time.sleep(10)
+        self.assertFalse(writer.finished)
+
+        # force MDS to read-only mode
+        self.fs.mds_asok(['force_readonly'])
+        time.sleep(10)
+
+        # touching test file should fail
+        try:
+            self.mount_a.run_shell(["touch", "test_file1"])
+        except CommandFailedError:
+            pass
+        else:
+            self.assertTrue(False)
+
+        # background writer also should fail
+        self.assertTrue(writer.finished)
+
+        # The MDS should report its readonly health state to the mon
+        self.wait_for_health("MDS_READ_ONLY", timeout=30)
+
+        # restart mds to make it writable
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.wait_for_health_clear(timeout=30)
diff --git a/qa/tasks/cephfs/test_backtrace.py b/qa/tasks/cephfs/test_backtrace.py
new file mode 100644
index 00000000..af246a1e
--- /dev/null
+++ b/qa/tasks/cephfs/test_backtrace.py
@@ -0,0 +1,78 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+class TestBacktrace(CephFSTestCase):
+    def test_backtrace(self):
+        """
+        That the 'parent' and 'layout' xattrs on the head objects of files
+        are updated correctly.
+        """
+
+        old_data_pool_name = self.fs.get_data_pool_name()
+        old_pool_id = self.fs.get_data_pool_id()
+
+        # Create a file for subsequent checks
+        self.mount_a.run_shell(["mkdir", "parent_a"])
+        self.mount_a.run_shell(["touch", "parent_a/alpha"])
+        file_ino = self.mount_a.path_to_ino("parent_a/alpha")
+
+        # That backtrace and layout are written after initial flush
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_a'], [a['dname'] for a in backtrace['ancestors']])
+        layout = self.fs.read_layout(file_ino)
+        self.assertDictEqual(layout, {
+            "stripe_unit": 4194304,
+            "stripe_count": 1,
+            "object_size": 4194304,
+            "pool_id": old_pool_id,
+            "pool_ns": "",
+        })
+        self.assertEqual(backtrace['pool'], old_pool_id)
+
+        # That backtrace is written after parentage changes
+        self.mount_a.run_shell(["mkdir", "parent_b"])
+        self.mount_a.run_shell(["mv", "parent_a/alpha", "parent_b/alpha"])
+
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace['ancestors']])
+
+        # Create a new data pool
+        new_pool_name = "data_new"
+        new_pool_id = self.fs.add_data_pool(new_pool_name)
+
+        # That an object which has switched pools gets its backtrace updated
+        self.mount_a.setfattr("./parent_b/alpha",
+                              "ceph.file.layout.pool", new_pool_name)
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
+        self.assertEqual(backtrace_old_pool['pool'], new_pool_id)
+        backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
+        self.assertEqual(backtrace_new_pool['pool'], new_pool_id)
+        new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name)
+        self.assertEqual(new_pool_layout['pool_id'], new_pool_id)
+        self.assertEqual(new_pool_layout['pool_ns'], '')
+
+        # That subsequent linkage changes are only written to new pool backtrace
+        self.mount_a.run_shell(["mkdir", "parent_c"])
+        self.mount_a.run_shell(["mv", "parent_b/alpha", "parent_c/alpha"])
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
+        self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace_old_pool['ancestors']])
+        backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
+        self.assertEqual(['alpha', 'parent_c'], [a['dname'] for a in backtrace_new_pool['ancestors']])
+
+        # That layout is written to new pool after change to other field in layout
+        self.mount_a.setfattr("./parent_c/alpha",
+                              "ceph.file.layout.object_size", "8388608")
+
+        self.fs.mds_asok(["flush", "journal"])
+        new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name)
+        self.assertEqual(new_pool_layout['object_size'], 8388608)
+
+        # ...but not to the old pool: the old pool's backtrace points to the new pool, and that's enough,
+        # we don't update the layout in all the old pools whenever it changes
+        old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name)
+        self.assertEqual(old_pool_layout['object_size'], 4194304)
diff --git a/qa/tasks/cephfs/test_cap_flush.py b/qa/tasks/cephfs/test_cap_flush.py
new file mode 100644
index 00000000..27b9af67
--- /dev/null
+++ b/qa/tasks/cephfs/test_cap_flush.py
@@ -0,0 +1,64 @@
+
+import os
+import time
+from textwrap import dedent
+from unittest import SkipTest
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+class TestCapFlush(CephFSTestCase):
+    @for_teuthology
+    def test_replay_create(self):
+        """
+        MDS starts to handle client caps when it enters clientreplay stage.
+        When handling a client cap in clientreplay stage, it's possible that
+        corresponding inode does not exist because the client request which
+        creates inode hasn't been replayed.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+        py_script = dedent("""
+            import os
+            os.mkdir("{0}")
+            fd = os.open("{0}", os.O_RDONLY)
+            os.fchmod(fd, 0o777)
+            os.fsync(fd)
+            """).format(dir_path)
+        self.mount_a.run_python(py_script)
+
+        self.fs.mds_asok(["flush", "journal"])
+
+        # client will only get unsafe replay
+        self.fs.mds_asok(["config", "set", "mds_log_pause", "1"])
+
+        file_name = "testfile"
+        file_path = dir_path + "/" + file_name
+
+        # Create a file and modify its mode. ceph-fuse will mark Ax cap dirty
+        py_script = dedent("""
+            import os
+            os.chdir("{0}")
+            os.setgid(65534)
+            os.setuid(65534)
+            fd = os.open("{1}", os.O_CREAT | os.O_RDWR, 0o644)
+            os.fchmod(fd, 0o640)
+            """).format(dir_path, file_name)
+        self.mount_a.run_python(py_script)
+
+        # Modify file mode by different user. ceph-fuse will send a setattr request
+        self.mount_a.run_shell(["chmod", "600", file_path], wait=False)
+
+        time.sleep(10)
+
+        # Restart mds. Client will re-send the unsafe request and cap flush
+        self.fs.mds_stop()
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip()
+        # If the cap flush get dropped, mode should be 0644.
+        # (Ax cap stays in dirty state, which prevents setattr reply from updating file mode)
+        self.assertEqual(mode, "600")
diff --git a/qa/tasks/cephfs/test_cephfs_shell.py b/qa/tasks/cephfs/test_cephfs_shell.py
new file mode 100644
index 00000000..8ddbaedb
--- /dev/null
+++ b/qa/tasks/cephfs/test_cephfs_shell.py
@@ -0,0 +1,279 @@
+import os
+import crypt
+import logging
+from six import StringIO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestCephFSShell(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+
+    def _cephfs_shell(self, cmd, opts=None, stdin=None):
+        args = ["cephfs-shell", "-c", self.mount_a.config_path]
+        if opts is not None:
+            args.extend(opts)
+        args.extend(("--", cmd))
+        log.info("Running command: {}".format(" ".join(args)))
+        status = self.mount_a.client_remote.run(args=args, stdout=StringIO(),
+                                                stdin=stdin)
+        return status.stdout.getvalue().strip()
+
+    def test_help(self):
+        """
+        Test that help outputs commands.
+        """
+
+        o = self._cephfs_shell("help")
+
+        log.info("output:\n{}".format(o))
+
+    def test_mkdir(self):
+        """
+        Test that mkdir creates directory
+        """
+        o = self._cephfs_shell("mkdir d1")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        o = self.mount_a.stat('d1')
+        log.info("mount_a output:\n{}".format(o))
+
+    def test_mkdir_with_07000_octal_mode(self):
+        """
+        Test that mkdir fails with octal mode greater than 0777
+        """
+        o = self._cephfs_shell("mkdir -m 07000 d2")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # mkdir d2 should fail
+        try:
+            o = self.mount_a.stat('d2')
+            log.info("mount_a output:\n{}".format(o))
+        except:
+            pass
+
+    def test_mkdir_with_negative_octal_mode(self):
+        """
+        Test that mkdir fails with negative octal mode
+        """
+        o = self._cephfs_shell("mkdir -m -0755 d3")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # mkdir d3 should fail
+        try:
+            o = self.mount_a.stat('d3')
+            log.info("mount_a output:\n{}".format(o))
+        except:
+            pass
+
+    def test_mkdir_with_non_octal_mode(self):
+        """
+        Test that mkdir passes with non-octal mode
+        """
+        o = self._cephfs_shell("mkdir -m u=rwx d4")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # mkdir d4 should pass
+        o = self.mount_a.stat('d4')
+        assert((o['st_mode'] & 0o700) == 0o700)
+
+    def test_mkdir_with_bad_non_octal_mode(self):
+        """
+        Test that mkdir failes with bad non-octal mode
+        """
+        o = self._cephfs_shell("mkdir -m ugx=0755 d5")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # mkdir d5 should fail
+        try:
+            o = self.mount_a.stat('d5')
+            log.info("mount_a output:\n{}".format(o))
+        except:
+            pass
+
+    def test_mkdir_path_without_path_option(self):
+        """
+        Test that mkdir fails without path option for creating path
+        """
+        o = self._cephfs_shell("mkdir d5/d6/d7")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # mkdir d5/d6/d7 should fail
+        try:
+            o = self.mount_a.stat('d5/d6/d7')
+            log.info("mount_a output:\n{}".format(o))
+        except:
+            pass
+
+    def test_mkdir_path_with_path_option(self):
+        """
+        Test that mkdir passes with path option for creating path
+        """
+        o = self._cephfs_shell("mkdir -p d5/d6/d7")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # mkdir d5/d6/d7 should pass
+        o = self.mount_a.stat('d5/d6/d7')
+        log.info("mount_a output:\n{}".format(o))
+
+    def validate_stat_output(self, s):
+        l = s.split('\n')
+        log.info("lines:\n{}".format(l))
+        rv = l[-1] # get last line; a failed stat will have "1" as the line
+        log.info("rv:{}".format(rv))
+        r = 0
+        try:
+            r = int(rv) # a non-numeric line will cause an exception
+        except:
+            pass
+        assert(r == 0)
+
+    def test_put_and_get_without_target_directory(self):
+        """
+        Test that put fails without target path
+        """
+        # generate test data in a directory
+        self._cephfs_shell("!mkdir p1")
+        self._cephfs_shell('!dd if=/dev/urandom of=p1/dump1 bs=1M count=1')
+        self._cephfs_shell('!dd if=/dev/urandom of=p1/dump2 bs=2M count=1')
+        self._cephfs_shell('!dd if=/dev/urandom of=p1/dump3 bs=3M count=1')
+
+        # copy the whole directory over to the cephfs
+        o = self._cephfs_shell("put p1")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # put p1 should pass
+        o = self.mount_a.stat('p1')
+        log.info("mount_a output:\n{}".format(o))
+        o = self.mount_a.stat('p1/dump1')
+        log.info("mount_a output:\n{}".format(o))
+        o = self.mount_a.stat('p1/dump2')
+        log.info("mount_a output:\n{}".format(o))
+        o = self.mount_a.stat('p1/dump3')
+        log.info("mount_a output:\n{}".format(o))
+
+        self._cephfs_shell('!rm -rf p1')
+        o = self._cephfs_shell("get p1")
+        o = self._cephfs_shell('!stat p1 || echo $?')
+        log.info("cephfs-shell output:\n{}".format(o))
+        self.validate_stat_output(o)
+
+        o = self._cephfs_shell('!stat p1/dump1 || echo $?')
+        log.info("cephfs-shell output:\n{}".format(o))
+        self.validate_stat_output(o)
+
+        o = self._cephfs_shell('!stat p1/dump2 || echo $?')
+        log.info("cephfs-shell output:\n{}".format(o))
+        self.validate_stat_output(o)
+
+        o = self._cephfs_shell('!stat p1/dump3 || echo $?')
+        log.info("cephfs-shell output:\n{}".format(o))
+        self.validate_stat_output(o)
+
+    # the 'put' command gets tested as well with the 'get' comamnd
+    def test_get_with_target_name(self):
+        """
+        Test that get passes with target name
+        """
+        s = 'C' * 1024
+        s_hash = crypt.crypt(s, '.A')
+        o = self._cephfs_shell("put - dump4", stdin=s)
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # put - dump4 should pass
+        o = self.mount_a.stat('dump4')
+        log.info("mount_a output:\n{}".format(o))
+
+        o = self._cephfs_shell("get dump4 .")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        o = self._cephfs_shell("!cat dump4")
+        o_hash = crypt.crypt(o, '.A')
+
+        # s_hash must be equal to o_hash
+        log.info("s_hash:{}".format(s_hash))
+        log.info("o_hash:{}".format(o_hash))
+        assert(s_hash == o_hash)
+
+    def test_get_without_target_name(self):
+        """
+        Test that get passes with target name
+        """
+        s = 'D' * 1024
+        o = self._cephfs_shell("put - dump5", stdin=s)
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # put - dump5 should pass
+        o = self.mount_a.stat('dump5')
+        log.info("mount_a output:\n{}".format(o))
+
+        # get dump5 should fail
+        o = self._cephfs_shell("get dump5")
+        o = self._cephfs_shell("!stat dump5 || echo $?")
+        log.info("cephfs-shell output:\n{}".format(o))
+        l = o.split('\n')
+        try:
+            ret = int(l[1])
+            # verify that stat dump5 passes
+            # if ret == 1, then that implies the stat failed
+            # which implies that there was a problem with "get dump5"
+            assert(ret != 1)
+        except ValueError:
+            # we have a valid stat output; so this is good
+            # if the int() fails then that means there's a valid stat output
+            pass
+
+    def test_get_to_console(self):
+        """
+        Test that get passes with target name
+        """
+        s = 'E' * 1024
+        s_hash = crypt.crypt(s, '.A')
+        o = self._cephfs_shell("put - dump6", stdin=s)
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # put - dump6 should pass
+        o = self.mount_a.stat('dump6')
+        log.info("mount_a output:\n{}".format(o))
+
+        # get dump6 - should pass
+        o = self._cephfs_shell("get dump6 -")
+        o_hash = crypt.crypt(o, '.A')
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # s_hash must be equal to o_hash
+        log.info("s_hash:{}".format(s_hash))
+        log.info("o_hash:{}".format(o_hash))
+        assert(s_hash == o_hash)
+
+#    def test_ls(self):
+#        """
+#        Test that ls passes
+#        """
+#        o = self._cephfs_shell("ls")
+#        log.info("cephfs-shell output:\n{}".format(o))
+#
+#        o = self.mount_a.run_shell(['ls']).stdout.getvalue().strip().replace("\n", " ").split()
+#        log.info("mount_a output:\n{}".format(o))
+#
+#        # ls should not list hidden files without the -a switch
+#        if '.' in o or '..' in o:
+#            log.info('ls failed')
+#        else:
+#            log.info('ls succeeded')
+#
+#    def test_ls_a(self):
+#        """
+#        Test that ls -a passes
+#        """
+#        o = self._cephfs_shell("ls -a")
+#        log.info("cephfs-shell output:\n{}".format(o))
+#
+#        o = self.mount_a.run_shell(['ls', '-a']).stdout.getvalue().strip().replace("\n", " ").split()
+#        log.info("mount_a output:\n{}".format(o))
+#
+#        if '.' in o and '..' in o:
+#            log.info('ls -a succeeded')
+#        else:
+#            log.info('ls -a failed')
diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py
new file mode 100644
index 00000000..613a405a
--- /dev/null
+++ b/qa/tasks/cephfs/test_client_limits.py
@@ -0,0 +1,330 @@
+
+"""
+Exercise the MDS's behaviour when clients and the MDCache reach or
+exceed the limits of how many caps/inodes they should hold.
+"""
+
+import logging
+from textwrap import dedent
+from unittest import SkipTest
+from teuthology.orchestra.run import CommandFailedError
+from tasks.ceph_test_case import TestTimeoutError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
+from tasks.cephfs.fuse_mount import FuseMount
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+# Hardcoded values from Server::recall_client_state
+CAP_RECALL_RATIO = 0.8
+CAP_RECALL_MIN = 100
+
+
+class TestClientLimits(CephFSTestCase):
+    REQUIRE_KCLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    def _test_client_pin(self, use_subdir, open_files):
+        """
+        When a client pins an inode in its cache, for example because the file is held open,
+        it should reject requests from the MDS to trim these caps.  The MDS should complain
+        to the user that it is unable to enforce its cache size limits because of this
+        objectionable client.
+
+        :param use_subdir: whether to put test files in a subdir or use root
+        """
+
+        self.config_set('mds', 'mds_cache_memory_limit', "1K")
+        self.config_set('mds', 'mds_recall_max_caps', int(open_files/2))
+        self.config_set('mds', 'mds_recall_warning_threshold', open_files)
+
+        mds_min_caps_per_client = int(self.config_get('mds.a', "mds_min_caps_per_client"))
+        self.config_set('mds', 'mds_min_caps_working_set', mds_min_caps_per_client)
+        mds_recall_warning_decay_rate = float(self.config_get('mds.a', "mds_recall_warning_decay_rate"))
+        self.assertGreaterEqual(open_files, mds_min_caps_per_client)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        path = "subdir" if use_subdir else "."
+        open_proc = self.mount_a.open_n_background(path, open_files)
+
+        # Client should now hold:
+        # `open_files` caps for the open files
+        # 1 cap for root
+        # 1 cap for subdir
+        self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'],
+                              open_files + (2 if use_subdir else 1),
+                              timeout=600,
+                              reject_fn=lambda x: x > open_files + 2)
+
+        # MDS should not be happy about that, as the client is failing to comply
+        # with the SESSION_RECALL messages it is being sent
+        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
+
+        # We can also test that the MDS health warning for oversized
+        # cache is functioning as intended.
+        self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
+
+        # When the client closes the files, it should retain only as many caps as allowed
+        # under the SESSION_RECALL policy
+        log.info("Terminating process holding files open")
+        open_proc.stdin.close()
+        try:
+            open_proc.wait()
+        except CommandFailedError:
+            # We killed it, so it raises an error
+            pass
+
+        # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
+        # which depend on the caps outstanding, cache size and overall ratio
+        def expected_caps():
+            num_caps = self.get_session(mount_a_client_id)['num_caps']
+            if num_caps <= mds_min_caps_per_client:
+                return True
+            else:
+                return False
+
+        self.wait_until_true(expected_caps, timeout=60)
+
+    @needs_trimming
+    def test_client_pin_root(self):
+        self._test_client_pin(False, 400)
+
+    @needs_trimming
+    def test_client_pin(self):
+        self._test_client_pin(True, 800)
+
+    @needs_trimming
+    def test_client_pin_mincaps(self):
+        self._test_client_pin(True, 200)
+
+    def test_client_min_caps_working_set(self):
+        """
+        When a client has inodes pinned in its cache (open files), that the MDS
+        will not warn about the client not responding to cache pressure when
+        the number of caps is below mds_min_caps_working_set.
+        """
+
+        # Set MDS cache memory limit to a low value that will make the MDS to
+        # ask the client to trim the caps.
+        cache_memory_limit = "1K"
+        open_files = 400
+
+        self.config_set('mds', 'mds_cache_memory_limit', cache_memory_limit)
+        self.config_set('mds', 'mds_recall_max_caps', int(open_files/2))
+        self.config_set('mds', 'mds_recall_warning_threshold', open_files)
+        self.config_set('mds', 'mds_min_caps_working_set', open_files*2)
+
+        mds_min_caps_per_client = int(self.config_get('mds.a', "mds_min_caps_per_client"))
+        mds_recall_warning_decay_rate = float(self.config_get('mds.a', "mds_recall_warning_decay_rate"))
+        self.assertGreaterEqual(open_files, mds_min_caps_per_client)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.mount_a.open_n_background("subdir", open_files)
+
+        # Client should now hold:
+        # `open_files` caps for the open files
+        # 1 cap for root
+        # 1 cap for subdir
+        self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'],
+                              open_files + 2,
+                              timeout=600,
+                              reject_fn=lambda x: x > open_files + 2)
+
+        # We can also test that the MDS health warning for oversized
+        # cache is functioning as intended.
+        self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
+
+        try:
+            # MDS should not be happy about that but it's not sending
+            # MDS_CLIENT_RECALL warnings because the client's caps are below
+            # mds_min_caps_working_set.
+            self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
+        except TestTimeoutError:
+            pass
+        else:
+            raise RuntimeError("expected no client recall warning")
+
+    def test_cap_acquisition_throttle_readdir(self):
+        """
+        Mostly readdir acquires caps faster than the mds recalls, so the cap
+        acquisition via readdir is throttled by retrying the readdir after
+        a fraction of second (0.5) by default when throttling condition is met.
+        """
+
+        max_caps_per_client = 500
+        cap_acquisition_throttle = 250
+
+        self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client)
+        self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle)
+
+        # Create 1500 files split across 6 directories, 250 each.
+        for i in range(1, 7):
+            self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # recursive readdir
+        self.mount_a.run_shell_payload("find | wc")
+
+        # validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250
+        cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
+        self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle)
+
+        # validate the throttle condition to be hit atleast once
+        cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle']
+        self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1)
+
+    def test_client_release_bug(self):
+        """
+        When a client has a bug (which we will simulate) preventing it from releasing caps,
+        the MDS should notice that releases are not being sent promptly, and generate a health
+        metric to that effect.
+        """
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        self.set_conf('client.{0}'.format(self.mount_a.client_id), 'client inject release failure', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Client A creates a file.  He will hold the write caps on the file, and later (simulated bug) fail
+        # to comply with the MDSs request to release that cap
+        self.mount_a.run_shell(["touch", "file1"])
+
+        # Client B tries to stat the file that client A created
+        rproc = self.mount_b.write_background("file1")
+
+        # After session_timeout, we should see a health warning (extra lag from
+        # MDS beacon period)
+        session_timeout = self.fs.get_var("session_timeout")
+        self.wait_for_health("MDS_CLIENT_LATE_RELEASE", session_timeout + 10)
+
+        # Client B should still be stuck
+        self.assertFalse(rproc.finished)
+
+        # Kill client A
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        # Client B should complete
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+        rproc.wait()
+
+    def test_client_oldest_tid(self):
+        """
+        When a client does not advance its oldest tid, the MDS should notice that
+        and generate health warnings.
+        """
+
+        # num of requests client issues
+        max_requests = 1000
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        self.set_conf('client', 'client inject fixed oldest tid', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        self.fs.mds_asok(['config', 'set', 'mds_max_completed_requests', '{0}'.format(max_requests)])
+
+        # Create lots of files
+        self.mount_a.create_n_files("testdir/file1", max_requests + 100)
+
+        # Create a few files synchronously. This makes sure previous requests are completed
+        self.mount_a.create_n_files("testdir/file2", 5, True)
+
+        # Wait for the health warnings. Assume mds can handle 10 request per second at least
+        self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10)
+
+    def _test_client_cache_size(self, mount_subdir):
+        """
+        check if client invalidate kernel dcache according to its cache size config
+        """
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        if mount_subdir:
+            # fuse assigns a fix inode number (1) to root inode. But in mounting into
+            # subdir case, the actual inode number of root is not 1. This mismatch
+            # confuses fuse_lowlevel_notify_inval_entry() when invalidating dentries
+            # in root directory.
+            self.mount_a.run_shell(["mkdir", "subdir"])
+            self.mount_a.umount_wait()
+            self.set_conf('client', 'client mountpoint', '/subdir')
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+            root_ino = self.mount_a.path_to_ino(".")
+            self.assertEqual(root_ino, 1);
+
+        dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+
+        mkdir_script = dedent("""
+            import os
+            os.mkdir("{path}")
+            for n in range(0, {num_dirs}):
+                os.mkdir("{path}/dir{{0}}".format(n))
+            """)
+
+        num_dirs = 1000
+        self.mount_a.run_python(mkdir_script.format(path=dir_path, num_dirs=num_dirs))
+        self.mount_a.run_shell(["sync"])
+
+        dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
+        self.assertGreaterEqual(dentry_count, num_dirs)
+        self.assertGreaterEqual(dentry_pinned_count, num_dirs)
+
+        cache_size = num_dirs // 10
+        self.mount_a.set_cache_size(cache_size)
+
+        def trimmed():
+            dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
+            log.info("waiting, dentry_count, dentry_pinned_count: {0}, {1}".format(
+                dentry_count, dentry_pinned_count
+            ))
+            if dentry_count > cache_size or dentry_pinned_count > cache_size:
+                return False
+
+            return True
+
+        self.wait_until_true(trimmed, 30)
+
+    @needs_trimming
+    def test_client_cache_size(self):
+        self._test_client_cache_size(False)
+        self._test_client_cache_size(True)
+
+    def test_client_max_caps(self):
+        """
+        That the MDS will not let a client sit above mds_max_caps_per_client caps.
+        """
+
+        mds_min_caps_per_client = int(self.config_get('mds.a', "mds_min_caps_per_client"))
+        mds_max_caps_per_client = 2*mds_min_caps_per_client
+        self.config_set('mds', 'mds_max_caps_per_client', mds_max_caps_per_client)
+
+        self.mount_a.create_n_files("foo/", 3*mds_max_caps_per_client, sync=True)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        def expected_caps():
+            num_caps = self.get_session(mount_a_client_id)['num_caps']
+            if num_caps <= mds_max_caps_per_client:
+                return True
+            else:
+                return False
+
+        self.wait_until_true(expected_caps, timeout=60)
diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py
new file mode 100644
index 00000000..c7806b71
--- /dev/null
+++ b/qa/tasks/cephfs/test_client_recovery.py
@@ -0,0 +1,633 @@
+
+"""
+Teuthology task for exercising CephFS client recovery
+"""
+
+import logging
+from textwrap import dedent
+import time
+import distutils.version as version
+import re
+import os
+
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.packaging import get_package_version
+from unittest import SkipTest
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+
+class TestClientNetworkRecovery(CephFSTestCase):
+    REQUIRE_KCLIENT_REMOTE = True
+    REQUIRE_ONE_CLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
+
+    # Environment references
+    mds_reconnect_timeout = None
+    ms_max_backoff = None
+
+    def test_network_death(self):
+        """
+        Simulate software freeze or temporary network failure.
+
+        Check that the client blocks I/O during failure, and completes
+        I/O after failure.
+        """
+
+        session_timeout = self.fs.get_var("session_timeout")
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+
+        # We only need one client
+        self.mount_b.umount_wait()
+
+        # Initially our one client session should be visible
+        client_id = self.mount_a.get_global_id()
+        ls_data = self._session_list()
+        self.assert_session_count(1, ls_data)
+        self.assertEqual(ls_data[0]['id'], client_id)
+        self.assert_session_state(client_id, "open")
+
+        # ...and capable of doing I/O without blocking
+        self.mount_a.create_files()
+
+        # ...but if we turn off the network
+        self.fs.set_clients_block(True)
+
+        # ...and try and start an I/O
+        write_blocked = self.mount_a.write_background()
+
+        # ...then it should block
+        self.assertFalse(write_blocked.finished)
+        self.assert_session_state(client_id, "open")
+        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
+        self.assertFalse(write_blocked.finished)
+        self.assert_session_state(client_id, "stale")
+
+        # ...until we re-enable I/O
+        self.fs.set_clients_block(False)
+
+        # ...when it should complete promptly
+        a = time.time()
+        self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
+        write_blocked.wait()  # Already know we're finished, wait() to raise exception on errors
+        recovery_time = time.time() - a
+        log.info("recovery time: {0}".format(recovery_time))
+        self.assert_session_state(client_id, "open")
+
+
+class TestClientRecovery(CephFSTestCase):
+    REQUIRE_KCLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
+
+    # Environment references
+    mds_reconnect_timeout = None
+    ms_max_backoff = None
+
+    def test_basic(self):
+        # Check that two clients come up healthy and see each others' files
+        # =====================================================
+        self.mount_a.create_files()
+        self.mount_a.check_files()
+        self.mount_a.umount_wait()
+
+        self.mount_b.check_files()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # Check that the admin socket interface is correctly reporting
+        # two sessions
+        # =====================================================
+        ls_data = self._session_list()
+        self.assert_session_count(2, ls_data)
+
+        self.assertSetEqual(
+            set([l['id'] for l in ls_data]),
+            {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
+        )
+
+    def test_restart(self):
+        # Check that after an MDS restart both clients reconnect and continue
+        # to handle I/O
+        # =====================================================
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        self.mount_a.create_destroy()
+        self.mount_b.create_destroy()
+
+    def _session_num_caps(self, client_id):
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps'])
+
+    def test_reconnect_timeout(self):
+        # Reconnect timeout
+        # =================
+        # Check that if I stop an MDS and a client goes away, the MDS waits
+        # for the reconnect period
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.mount_a.umount_wait(force=True)
+
+        self.fs.mds_restart()
+
+        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
+        # Check that the MDS locally reports its state correctly
+        status = self.fs.mds_asok(['status'])
+        self.assertIn("reconnect_status", status)
+
+        ls_data = self._session_list()
+        self.assert_session_count(2, ls_data)
+
+        # The session for the dead client should have the 'reconnect' flag set
+        self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])
+
+        # Wait for the reconnect state to clear, this should take the
+        # reconnect timeout period.
+        in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
+        # Check that the period we waited to enter active is within a factor
+        # of two of the reconnect timeout.
+        self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout // 2,
+                           "Should have been in reconnect phase for {0} but only took {1}".format(
+                               self.mds_reconnect_timeout, in_reconnect_for
+                           ))
+
+        self.assert_session_count(1)
+
+        # Check that the client that timed out during reconnect can
+        # mount again and do I/O
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.create_destroy()
+
+        self.assert_session_count(2)
+
+    def test_reconnect_eviction(self):
+        # Eviction during reconnect
+        # =========================
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # The mount goes away while the MDS is offline
+        self.mount_a.kill()
+
+        # wait for it to die
+        time.sleep(5)
+
+        self.fs.mds_restart()
+
+        # Enter reconnect phase
+        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
+        self.assert_session_count(2)
+
+        # Evict the stuck client
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+        self.assert_session_count(1)
+
+        # Observe that we proceed to active phase without waiting full reconnect timeout
+        evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+        # Once we evict the troublemaker, the reconnect phase should complete
+        # in well under the reconnect timeout.
+        self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
+                        "reconnect did not complete soon enough after eviction, took {0}".format(
+                            evict_til_active
+                        ))
+
+        # We killed earlier so must clean up before trying to use again
+        self.mount_a.kill_cleanup()
+
+        # Bring the client back
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.create_destroy()
+
+    def _test_stale_caps(self, write):
+        session_timeout = self.fs.get_var("session_timeout")
+
+        # Capability release from stale session
+        # =====================================
+        if write:
+            cap_holder = self.mount_a.open_background()
+        else:
+            self.mount_a.run_shell(["touch", "background_file"])
+            self.mount_a.umount_wait()
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+            cap_holder = self.mount_a.open_background(write=False)
+
+        self.assert_session_count(2)
+        mount_a_gid = self.mount_a.get_global_id()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.kill()
+
+        # wait for it to die so it doesn't voluntarily release buffer cap
+        time.sleep(5)
+
+        try:
+            # Now, after session_timeout seconds, the waiter should
+            # complete their operation when the MDS marks the holder's
+            # session stale.
+            cap_waiter = self.mount_b.write_background()
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+
+            # Should have succeeded
+            self.assertEqual(cap_waiter.exitstatus, 0)
+
+            if write:
+                self.assert_session_count(1)
+            else:
+                self.assert_session_state(mount_a_gid, "stale")
+
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0,
+                            "Capability handover took {0}, expected approx {1}".format(
+                                cap_waited, session_timeout
+                            ))
+
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except (CommandFailedError, ConnectionLostError):
+                # We killed it (and possibly its node), so it raises an error
+                pass
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill_cleanup()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_stale_read_caps(self):
+        self._test_stale_caps(False)
+
+    def test_stale_write_caps(self):
+        self._test_stale_caps(True)
+
+    def test_evicted_caps(self):
+        # Eviction while holding a capability
+        # ===================================
+
+        session_timeout = self.fs.get_var("session_timeout")
+
+        # Take out a write capability on a file on client A,
+        # and then immediately kill it.
+        cap_holder = self.mount_a.open_background()
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.kill()
+
+        # wait for it to die so it doesn't voluntarily release buffer cap
+        time.sleep(5)
+
+        try:
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+            time.sleep(5)
+            self.assertFalse(cap_waiter.finished)
+
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+            # Now, because I evicted the old holder of the capability, it should
+            # immediately get handed over to the waiter
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            # This is the check that it happened 'now' rather than waiting
+            # for the session timeout
+            self.assertLess(cap_waited, session_timeout / 2.0,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, session_timeout / 2.0
+                            ))
+
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except (CommandFailedError, ConnectionLostError):
+                # We killed it (and possibly its node), so it raises an error
+                pass
+        finally:
+            self.mount_a.kill_cleanup()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_trim_caps(self):
+        # Trim capability when reconnecting MDS
+        # ===================================
+
+        count = 500
+        # Create lots of files
+        for i in range(count):
+            self.mount_a.run_shell(["touch", "f{0}".format(i)])
+
+        # Populate mount_b's cache
+        self.mount_b.run_shell(["ls", "-l"])
+
+        client_id = self.mount_b.get_global_id()
+        num_caps = self._session_num_caps(client_id)
+        self.assertGreaterEqual(num_caps, count)
+
+        # Restart MDS. client should trim its cache when reconnecting to the MDS
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        num_caps = self._session_num_caps(client_id)
+        self.assertLess(num_caps, count,
+                        "should have less than {0} capabilities, have {1}".format(
+                            count, num_caps
+                        ))
+
+    def _is_flockable(self):
+        a_version_str = get_package_version(self.mount_a.client_remote, "fuse")
+        b_version_str = get_package_version(self.mount_b.client_remote, "fuse")
+        flock_version_str = "2.9"
+
+        version_regex = re.compile(r"[0-9\.]+")
+        a_result = version_regex.match(a_version_str)
+        self.assertTrue(a_result)
+        b_result = version_regex.match(b_version_str)
+        self.assertTrue(b_result)
+        a_version = version.StrictVersion(a_result.group())
+        b_version = version.StrictVersion(b_result.group())
+        flock_version=version.StrictVersion(flock_version_str)
+
+        if (a_version >= flock_version and b_version >= flock_version):
+            log.info("flock locks are available")
+            return True
+        else:
+            log.info("not testing flock locks, machines have versions {av} and {bv}".format(
+                av=a_version_str,bv=b_version_str))
+            return False
+
+    def test_filelock(self):
+        """
+        Check that file lock doesn't get lost after an MDS restart
+        """
+
+        flockable = self._is_flockable()
+        lock_holder = self.mount_a.lock_background(do_flock=flockable)
+
+        self.mount_b.wait_for_visible("background_file-2")
+        self.mount_b.check_filelock(do_flock=flockable)
+
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        self.mount_b.check_filelock(do_flock=flockable)
+
+        # Tear down the background process
+        lock_holder.stdin.close()
+        try:
+            lock_holder.wait()
+        except (CommandFailedError, ConnectionLostError):
+            # We killed it, so it raises an error
+            pass
+
+    def test_filelock_eviction(self):
+        """
+        Check that file lock held by evicted client is given to
+        waiting client.
+        """
+        if not self._is_flockable():
+            self.skipTest("flock is not available")
+
+        lock_holder = self.mount_a.lock_background()
+        self.mount_b.wait_for_visible("background_file-2")
+        self.mount_b.check_filelock()
+
+        lock_taker = self.mount_b.lock_and_release()
+        # Check the taker is waiting (doesn't get it immediately)
+        time.sleep(2)
+        self.assertFalse(lock_holder.finished)
+        self.assertFalse(lock_taker.finished)
+
+        try:
+            mount_a_client_id = self.mount_a.get_global_id()
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+
+            # Evicting mount_a should let mount_b's attempt to take the lock
+            # succeed
+            self.wait_until_true(lambda: lock_taker.finished, timeout=10)
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+        # Bring the client back
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_dir_fsync(self):
+        self._test_fsync(True);
+
+    def test_create_fsync(self):
+        self._test_fsync(False);
+
+    def _test_fsync(self, dirfsync):
+        """
+        That calls to fsync guarantee visibility of metadata to another
+        client immediately after the fsyncing client dies.
+        """
+
+        # Leave this guy out until he's needed
+        self.mount_b.umount_wait()
+
+        # Create dir + child dentry on client A, and fsync the dir
+        path = os.path.join(self.mount_a.mountpoint, "subdir")
+        self.mount_a.run_python(
+            dedent("""
+                import os
+                import time
+
+                path = "{path}"
+
+                print("Starting creation...")
+                start = time.time()
+
+                os.mkdir(path)
+                dfd = os.open(path, os.O_DIRECTORY)
+
+                fd = open(os.path.join(path, "childfile"), "w")
+                print("Finished creation in {{0}}s".format(time.time() - start))
+
+                print("Starting fsync...")
+                start = time.time()
+                if {dirfsync}:
+                    os.fsync(dfd)
+                else:
+                    os.fsync(fd)
+                print("Finished fsync in {{0}}s".format(time.time() - start))
+            """.format(path=path,dirfsync=str(dirfsync)))
+        )
+
+        # Immediately kill the MDS and then client A
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        # Restart the MDS.  Wait for it to come up, it'll have to time out in clientreplay
+        self.fs.mds_restart()
+        log.info("Waiting for reconnect...")
+        self.fs.wait_for_state("up:reconnect")
+        log.info("Waiting for active...")
+        self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
+        log.info("Reached active...")
+
+        # Is the child dentry visible from mount B?
+        self.mount_b.mount()
+        self.mount_b.wait_until_mounted()
+        self.mount_b.run_shell(["ls", "subdir/childfile"])
+
+    def test_unmount_for_evicted_client(self):
+        """Test if client hangs on unmount after evicting the client."""
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+
+        self.mount_a.umount_wait(require_clean=True, timeout=30)
+
+    def test_stale_renew(self):
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to handle signal STOP/CONT")
+
+        session_timeout = self.fs.get_var("session_timeout")
+
+        self.mount_a.run_shell(["mkdir", "testdir"])
+        self.mount_a.run_shell(["touch", "testdir/file1"])
+        # populate readdir cache
+        self.mount_a.run_shell(["ls", "testdir"])
+        self.mount_b.run_shell(["ls", "testdir"])
+
+        # check if readdir cache is effective
+        initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
+        self.mount_b.run_shell(["ls", "testdir"])
+        current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
+        self.assertEqual(current_readdirs, initial_readdirs);
+
+        mount_b_gid = self.mount_b.get_global_id()
+        mount_b_pid = self.mount_b.get_client_pid()
+        # stop ceph-fuse process of mount_b
+        self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])
+
+        self.assert_session_state(mount_b_gid, "open")
+        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
+
+        self.mount_a.run_shell(["touch", "testdir/file2"])
+        self.assert_session_state(mount_b_gid, "stale")
+
+        # resume ceph-fuse process of mount_b
+        self.mount_b.client_remote.run(args=["sudo", "kill", "-CONT", mount_b_pid])
+        # Is the new file visible from mount_b? (caps become invalid after session stale)
+        self.mount_b.run_shell(["ls", "testdir/file2"])
+
+    def test_abort_conn(self):
+        """
+        Check that abort_conn() skips closing mds sessions.
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Testing libcephfs function")
+
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        session_timeout = self.fs.get_var("session_timeout")
+
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        gid_str = self.mount_a.run_python(dedent("""
+            import cephfs as libcephfs
+            cephfs = libcephfs.LibCephFS(conffile='')
+            cephfs.mount()
+            client_id = cephfs.get_instance_id()
+            cephfs.abort_conn()
+            print(client_id)
+            """)
+        )
+        gid = int(gid_str);
+
+        self.assert_session_state(gid, "open")
+        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
+        self.assert_session_state(gid, "stale")
+
+    def test_dont_mark_unresponsive_client_stale(self):
+        """
+        Test that an unresponsive client holding caps is not marked stale or
+        evicted unless another clients wants its caps.
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to handle signal STOP/CONT")
+
+        # XXX: To conduct this test we need at least two clients since a
+        # single client is never evcited by MDS.
+        SESSION_TIMEOUT = 30
+        SESSION_AUTOCLOSE = 50
+        time_at_beg = time.time()
+        mount_a_gid = self.mount_a.get_global_id()
+        _ = self.mount_a.client_pid
+        self.fs.set_var('session_timeout', SESSION_TIMEOUT)
+        self.fs.set_var('session_autoclose', SESSION_AUTOCLOSE)
+        self.assert_session_count(2, self.fs.mds_asok(['session', 'ls']))
+
+        # test that client holding cap not required by any other client is not
+        # marked stale when it becomes unresponsive.
+        self.mount_a.run_shell(['mkdir', 'dir'])
+        self.mount_a.send_signal('sigstop')
+        time.sleep(SESSION_TIMEOUT + 2)
+        self.assert_session_state(mount_a_gid, "open")
+
+        # test that other clients have to wait to get the caps from
+        # unresponsive client until session_autoclose.
+        self.mount_b.run_shell(['stat', 'dir'])
+        self.assert_session_count(1, self.fs.mds_asok(['session', 'ls']))
+        self.assertLess(time.time(), time_at_beg + SESSION_AUTOCLOSE)
+
+        self.mount_a.send_signal('sigcont')
+
+    def test_config_session_timeout(self):
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        session_timeout = self.fs.get_var("session_timeout")
+        mount_a_gid = self.mount_a.get_global_id()
+
+        self.fs.mds_asok(['session', 'config', '%s' % mount_a_gid, 'timeout', '%s' % (session_timeout * 2)])
+
+        self.mount_a.kill();
+
+        self.assert_session_count(2)
+
+        time.sleep(session_timeout * 1.5)
+        self.assert_session_state(mount_a_gid, "open")
+
+        time.sleep(session_timeout)
+        self.assert_session_count(1)
+
+        self.mount_a.kill_cleanup()
diff --git a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py
new file mode 100644
index 00000000..d03e027e
--- /dev/null
+++ b/qa/tasks/cephfs/test_damage.py
@@ -0,0 +1,569 @@
+import json
+import logging
+import errno
+import re
+from teuthology.contextutil import MaxWhileTries
+from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra.run import wait
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+DAMAGED_ON_START = "damaged_on_start"
+DAMAGED_ON_LS = "damaged_on_ls"
+CRASHED = "server crashed"
+NO_DAMAGE = "no damage"
+READONLY = "readonly"
+FAILED_CLIENT = "client failed"
+FAILED_SERVER = "server failed"
+
+# An EIO in response to a stat from the client
+EIO_ON_LS = "eio"
+
+# An EIO, but nothing in damage table (not ever what we expect)
+EIO_NO_DAMAGE = "eio without damage entry"
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDamage(CephFSTestCase):
+    def _simple_workload_write(self):
+        self.mount_a.run_shell(["mkdir", "subdir"])
+        self.mount_a.write_n_mb("subdir/sixmegs", 6)
+        return self.mount_a.stat("subdir/sixmegs")
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    @for_teuthology #459s
+    def test_object_deletion(self):
+        """
+        That the MDS has a clean 'damaged' response to loss of any single metadata object
+        """
+
+        self._simple_workload_write()
+
+        # Hmm, actually it would be nice to permute whether the metadata pool
+        # state contains sessions or not, but for the moment close this session
+        # to avoid waiting through reconnect on every MDS start.
+        self.mount_a.umount_wait()
+        for mds_name in self.fs.get_active_names():
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        self.fs.rados(['export', '/tmp/metadata.bin'])
+
+        def is_ignored(obj_id, dentry=None):
+            """
+            A filter to avoid redundantly mutating many similar objects (e.g.
+            stray dirfrags) or similar dentries (e.g. stray dir dentries)
+            """
+            if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000":
+                return True
+
+            if dentry and obj_id == "100.00000000":
+                if re.match("stray.+_head", dentry) and dentry != "stray0_head":
+                    return True
+
+            return False
+
+        def get_path(obj_id, dentry=None):
+            """
+            What filesystem path does this object or dentry correspond to?   i.e.
+            what should I poke to see EIO after damaging it?
+            """
+
+            if obj_id == "1.00000000" and dentry == "subdir_head":
+                return "./subdir"
+            elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head":
+                return "./subdir/sixmegs"
+
+            # None means ls will do an "ls -R" in hope of seeing some errors
+            return None
+
+        objects = self.fs.rados(["ls"]).split("\n")
+        objects = [o for o in objects if not is_ignored(o)]
+
+        # Find all objects with an OMAP header
+        omap_header_objs = []
+        for o in objects:
+            header = self.fs.rados(["getomapheader", o])
+            # The rados CLI wraps the header output in a hex-printed style
+            header_bytes = int(re.match("header \((.+) bytes\)", header).group(1))
+            if header_bytes > 0:
+                omap_header_objs.append(o)
+
+        # Find all OMAP key/vals
+        omap_keys = []
+        for o in objects:
+            keys_str = self.fs.rados(["listomapkeys", o])
+            if keys_str:
+                for key in keys_str.split("\n"):
+                    if not is_ignored(o, key):
+                        omap_keys.append((o, key))
+
+        # Find objects that have data in their bodies
+        data_objects = []
+        for obj_id in objects:
+            stat_out = self.fs.rados(["stat", obj_id])
+            size = int(re.match(".+, size (.+)$", stat_out).group(1))
+            if size > 0:
+                data_objects.append(obj_id)
+
+        # Define the various forms of damage we will inflict
+        class MetadataMutation(object):
+            def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None):
+                self.obj_id = obj_id_
+                self.desc = desc_
+                self.mutate_fn = mutate_fn_
+                self.expectation = expectation_
+                if ls_path is None:
+                    self.ls_path = "."
+                else:
+                    self.ls_path = ls_path
+
+            def __eq__(self, other):
+                return self.desc == other.desc
+
+            def __hash__(self):
+                return hash(self.desc)
+
+        junk = "deadbeef" * 10
+        mutations = []
+
+        # Removals
+        for o in objects:
+            if o in [
+                # JournalPointers are auto-replaced if missing (same path as upgrade)
+                "400.00000000",
+                # Missing dirfrags for non-system dirs result in empty directory
+                "10000000000.00000000",
+                # PurgeQueue is auto-created if not found on startup
+                "500.00000000",
+                # open file table is auto-created if not found on startup
+                "mds0_openfiles.0"
+            ]:
+                expectation = NO_DAMAGE
+            else:
+                expectation = DAMAGED_ON_START
+
+            log.info("Expectation on rm '{0}' will be '{1}'".format(
+                o, expectation
+            ))
+
+            mutations.append(MetadataMutation(
+                o,
+                "Delete {0}".format(o),
+                lambda o=o: self.fs.rados(["rm", o]),
+                expectation
+            ))
+
+        # Blatant corruptions
+        for obj_id in data_objects:
+            if obj_id == "500.00000000":
+                # purge queue corruption results in read-only FS
+                mutations.append(MetadataMutation(
+                    obj_id,
+                    "Corrupt {0}".format(obj_id),
+                    lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk),
+                    READONLY
+                ))
+            else:
+                mutations.append(MetadataMutation(
+                    obj_id,
+                    "Corrupt {0}".format(obj_id),
+                    lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk),
+                    DAMAGED_ON_START
+                ))
+
+        # Truncations
+        for o in data_objects:
+            if o == "500.00000000":
+                # The PurgeQueue is allowed to be empty: Journaler interprets
+                # an empty header object as an empty journal.
+                expectation = NO_DAMAGE
+            else:
+                expectation = DAMAGED_ON_START
+
+            mutations.append(
+                MetadataMutation(
+                    o,
+                    "Truncate {0}".format(o),
+                    lambda o=o: self.fs.rados(["truncate", o, "0"]),
+                    expectation
+            ))
+
+        # OMAP value corruptions
+        for o, k in omap_keys:
+            if o.startswith("100."):
+                # Anything in rank 0's 'mydir'
+                expectation = DAMAGED_ON_START
+            else:
+                expectation = EIO_ON_LS
+
+            mutations.append(
+                MetadataMutation(
+                    o,
+                    "Corrupt omap key {0}:{1}".format(o, k),
+                    lambda o=o,k=k: self.fs.rados(["setomapval", o, k, junk]),
+                    expectation,
+                    get_path(o, k)
+                )
+            )
+
+        # OMAP header corruptions
+        for o in omap_header_objs:
+            if re.match("60.\.00000000", o) \
+                    or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
+                expectation = DAMAGED_ON_START
+            else:
+                expectation = NO_DAMAGE
+
+            log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
+                o, expectation
+            ))
+
+            mutations.append(
+                MetadataMutation(
+                    o,
+                    "Corrupt omap header on {0}".format(o),
+                    lambda o=o: self.fs.rados(["setomapheader", o, junk]),
+                    expectation
+                )
+            )
+
+        results = {}
+
+        for mutation in mutations:
+            log.info("Applying mutation '{0}'".format(mutation.desc))
+
+            # Reset MDS state
+            self.mount_a.umount_wait(force=True)
+            self.fs.mds_stop()
+            self.fs.mds_fail()
+            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+            # Reset RADOS pool state
+            self.fs.rados(['import', '/tmp/metadata.bin'])
+
+            # Inject the mutation
+            mutation.mutate_fn()
+
+            # Try starting the MDS
+            self.fs.mds_restart()
+
+            # How long we'll wait between starting a daemon and expecting
+            # it to make it through startup, and potentially declare itself
+            # damaged to the mon cluster.
+            startup_timeout = 60
+
+            if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE):
+                if mutation.expectation == DAMAGED_ON_START:
+                    # The MDS may pass through active before making it to damaged
+                    try:
+                        self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout)
+                    except RuntimeError:
+                        pass
+
+                # Wait for MDS to either come up or go into damaged state
+                try:
+                    self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout)
+                except RuntimeError:
+                    crashed = False
+                    # Didn't make it to healthy or damaged, did it crash?
+                    for daemon_id, daemon in self.fs.mds_daemons.items():
+                        if daemon.proc and daemon.proc.finished:
+                            crashed = True
+                            log.error("Daemon {0} crashed!".format(daemon_id))
+                            daemon.proc = None  # So that subsequent stop() doesn't raise error
+                    if not crashed:
+                        # Didn't go health, didn't go damaged, didn't crash, so what?
+                        raise
+                    else:
+                        log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
+                        results[mutation] = CRASHED
+                        continue
+                if self.is_marked_damaged(0):
+                    log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
+                    results[mutation] = DAMAGED_ON_START
+                    continue
+                else:
+                    log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
+            else:
+                try:
+                    self.wait_until_true(self.fs.are_daemons_healthy, 60)
+                except RuntimeError:
+                    log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
+                    if self.is_marked_damaged(0):
+                        results[mutation] = DAMAGED_ON_START
+                    else:
+                        results[mutation] = FAILED_SERVER
+                    continue
+                log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))
+
+            # MDS is up, should go damaged on ls or client mount
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+            if mutation.ls_path == ".":
+                proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False)
+            else:
+                proc = self.mount_a.stat(mutation.ls_path, wait=False)
+
+            if mutation.expectation == DAMAGED_ON_LS:
+                try:
+                    self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+                    log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
+                    results[mutation] = DAMAGED_ON_LS
+                except RuntimeError:
+                    if self.fs.are_daemons_healthy():
+                        log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
+                            mutation.desc))
+                        results[mutation] = NO_DAMAGE
+                    else:
+                        log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
+                        results[mutation] = FAILED_SERVER
+            elif mutation.expectation == READONLY:
+                proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False)
+                try:
+                    proc.wait()
+                except CommandFailedError:
+                    stderr = proc.stderr.getvalue()
+                    log.info(stderr)
+                    if "Read-only file system".lower() in stderr.lower():
+                        pass
+                    else:
+                        raise
+            else:
+                try:
+                    wait([proc], 20)
+                    log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc))
+                    results[mutation] = NO_DAMAGE
+                except MaxWhileTries:
+                    log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
+                    results[mutation] = FAILED_CLIENT
+                except CommandFailedError as e:
+                    if e.exitstatus == errno.EIO:
+                        log.info("Result: EIO on client")
+                        results[mutation] = EIO_ON_LS
+                    else:
+                        log.info("Result: unexpected error {0} on client".format(e))
+                        results[mutation] = FAILED_CLIENT
+
+            if mutation.expectation == EIO_ON_LS:
+                # EIOs mean something handled by DamageTable: assert that it has
+                # been populated
+                damage = json.loads(
+                    self.fs.mon_manager.raw_cluster_cmd(
+                        'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty'))
+                if len(damage) == 0:
+                    results[mutation] = EIO_NO_DAMAGE
+
+        failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
+        if failures:
+            log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
+            for mutation, result in failures:
+                log.error("  Expected '{0}' actually '{1}' from '{2}'".format(
+                    mutation.expectation, result, mutation.desc
+                ))
+            raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
+        else:
+            log.info("All {0} mutations had expected outcomes".format(len(mutations)))
+
+    def test_damaged_dentry(self):
+        # Damage to dentrys is interesting because it leaves the
+        # directory's `complete` flag in a subtle state where
+        # we have marked the dir complete in order that folks
+        # can access it, but in actual fact there is a dentry
+        # missing
+        self.mount_a.run_shell(["mkdir", "subdir/"])
+
+        self.mount_a.run_shell(["touch", "subdir/file_undamaged"])
+        self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"])
+
+        subdir_ino = self.mount_a.path_to_ino("subdir")
+
+        self.mount_a.umount_wait()
+        for mds_name in self.fs.get_active_names():
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Corrupt a dentry
+        junk = "deadbeef" * 10
+        dirfrag_obj = "{0:x}.00000000".format(subdir_ino)
+        self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
+
+        # Start up and try to list it
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        dentries = self.mount_a.ls("subdir/")
+
+        # The damaged guy should have disappeared
+        self.assertEqual(dentries, ["file_undamaged"])
+
+        # I should get ENOENT if I try and read it normally, because
+        # the dir is considered complete
+        try:
+            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            raise AssertionError("Expected ENOENT")
+
+        # The fact that there is damaged should have bee recorded
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        self.assertEqual(len(damage), 1)
+        damage_id = damage[0]['id']
+
+        # If I try to create a dentry with the same name as the damaged guy
+        # then that should be forbidden
+        try:
+            self.mount_a.touch("subdir/file_to_be_damaged")
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.EIO)
+        else:
+            raise AssertionError("Expected EIO")
+
+        # Attempting that touch will clear the client's complete flag, now
+        # when I stat it I'll get EIO instead of ENOENT
+        try:
+            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
+        except CommandFailedError as e:
+            if isinstance(self.mount_a, FuseMount):
+                self.assertEqual(e.exitstatus, errno.EIO)
+            else:
+                # Kernel client handles this case differently
+                self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            raise AssertionError("Expected EIO")
+
+        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
+        self.assertEqual(nfiles, "2")
+
+        self.mount_a.umount_wait()
+
+        # Now repair the stats
+        scrub_json = self.fs.rank_tell(["scrub", "start", "/subdir", "repair"])
+        log.info(json.dumps(scrub_json, indent=2))
+
+        self.assertEqual(scrub_json["passed_validation"], False)
+        self.assertEqual(scrub_json["raw_stats"]["checked"], True)
+        self.assertEqual(scrub_json["raw_stats"]["passed"], False)
+
+        # Check that the file count is now correct
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
+        self.assertEqual(nfiles, "1")
+
+        # Clean up the omap object
+        self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
+
+        # Clean up the damagetable entry
+        self.fs.mon_manager.raw_cluster_cmd(
+            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+            "damage", "rm", "{did}".format(did=damage_id))
+
+        # Now I should be able to create a file with the same name as the
+        # damaged guy if I want.
+        self.mount_a.touch("subdir/file_to_be_damaged")
+
+    def test_open_ino_errors(self):
+        """
+        That errors encountered during opening inos are properly propagated
+        """
+
+        self.mount_a.run_shell(["mkdir", "dir1"])
+        self.mount_a.run_shell(["touch", "dir1/file1"])
+        self.mount_a.run_shell(["mkdir", "dir2"])
+        self.mount_a.run_shell(["touch", "dir2/file2"])
+        self.mount_a.run_shell(["mkdir", "testdir"])
+        self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"])
+        self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"])
+
+        file1_ino = self.mount_a.path_to_ino("dir1/file1")
+        file2_ino = self.mount_a.path_to_ino("dir2/file2")
+        dir2_ino = self.mount_a.path_to_ino("dir2")
+
+        # Ensure everything is written to backing store
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Drop everything from the MDS cache
+        self.mds_cluster.mds_stop()
+        self.fs.journal_tool(['journal', 'reset'], 0)
+        self.mds_cluster.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+
+        # Case 1: un-decodeable backtrace
+
+        # Validate that the backtrace is present and decodable
+        self.fs.read_backtrace(file1_ino)
+        # Go corrupt the backtrace of alpha/target (used for resolving
+        # bravo/hardlink).
+        self.fs._write_data_xattr(file1_ino, "parent", "rhubarb")
+
+        # Check that touching the hardlink gives EIO
+        ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False)
+        try:
+            ran.wait()
+        except CommandFailedError:
+            self.assertTrue("Input/output error" in ran.stderr.getvalue())
+
+        # Check that an entry is created in the damage table
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        self.assertEqual(len(damage), 1)
+        self.assertEqual(damage[0]['damage_type'], "backtrace")
+        self.assertEqual(damage[0]['ino'], file1_ino)
+
+        self.fs.mon_manager.raw_cluster_cmd(
+            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+            "damage", "rm", str(damage[0]['id']))
+
+
+        # Case 2: missing dirfrag for the target inode
+
+        self.fs.rados(["rm", "{0:x}.00000000".format(dir2_ino)])
+
+        # Check that touching the hardlink gives EIO
+        ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False)
+        try:
+            ran.wait()
+        except CommandFailedError:
+            self.assertTrue("Input/output error" in ran.stderr.getvalue())
+
+        # Check that an entry is created in the damage table
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        self.assertEqual(len(damage), 2)
+        if damage[0]['damage_type'] == "backtrace" :
+            self.assertEqual(damage[0]['ino'], file2_ino)
+            self.assertEqual(damage[1]['damage_type'], "dir_frag")
+            self.assertEqual(damage[1]['ino'], dir2_ino)
+        else:
+            self.assertEqual(damage[0]['damage_type'], "dir_frag")
+            self.assertEqual(damage[0]['ino'], dir2_ino)
+            self.assertEqual(damage[1]['damage_type'], "backtrace")
+            self.assertEqual(damage[1]['ino'], file2_ino)
+
+        for entry in damage:
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "rm", str(entry['id']))
diff --git a/qa/tasks/cephfs/test_data_scan.py b/qa/tasks/cephfs/test_data_scan.py
new file mode 100644
index 00000000..cbd5109a
--- /dev/null
+++ b/qa/tasks/cephfs/test_data_scan.py
@@ -0,0 +1,695 @@
+
+"""
+Test our tools for recovering metadata from the data pool
+"""
+import json
+
+import logging
+import os
+import time
+import traceback
+
+from io import BytesIO
+from collections import namedtuple, defaultdict
+from textwrap import dedent
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class Workload(object):
+    def __init__(self, filesystem, mount):
+        self._mount = mount
+        self._filesystem = filesystem
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        objects = self._filesystem.rados(["ls"]).split("\n")
+        for o in objects:
+            self._filesystem.rados(["rm", o])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._filesystem.mds_asok(["flush", "journal"])
+
+
+class SimpleWorkload(Workload):
+    """
+    Single file, single directory, check that it gets recovered and so does its size
+    """
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def validate(self):
+        self._mount.run_shell(["ls", "subdir"])
+        st = self._mount.stat("subdir/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+
+class MovedFile(Workload):
+    def write(self):
+        # Create a file whose backtrace disagrees with his eventual position
+        # in the metadata.  We will see that he gets reconstructed in his
+        # original position according to his backtrace.
+        self._mount.run_shell(["mkdir", "subdir_alpha"])
+        self._mount.run_shell(["mkdir", "subdir_bravo"])
+        self._mount.write_n_mb("subdir_alpha/sixmegs", 6)
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"])
+        self._initial_state = self._mount.stat("subdir_bravo/sixmegs")
+
+    def flush(self):
+        pass
+
+    def validate(self):
+        self.assert_equal(self._mount.ls(), ["subdir_alpha"])
+        st = self._mount.stat("subdir_alpha/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+
+class BacktracelessFile(Workload):
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def flush(self):
+        # Never flush metadata, so backtrace won't be written
+        pass
+
+    def validate(self):
+        ino_name = "%x" % self._initial_state["st_ino"]
+
+        # The inode should be linked into lost+found because we had no path for it
+        self.assert_equal(self._mount.ls(), ["lost+found"])
+        self.assert_equal(self._mount.ls("lost+found"), [ino_name])
+        st = self._mount.stat("lost+found/{ino_name}".format(ino_name=ino_name))
+
+        # We might not have got the name or path, but we should still get the size
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+        return self._errors
+
+
+class StripedStashedLayout(Workload):
+    def __init__(self, fs, m):
+        super(StripedStashedLayout, self).__init__(fs, m)
+
+        # Nice small stripes so we can quickly do our writes+validates
+        self.sc = 4
+        self.ss = 65536
+        self.os = 262144
+
+        self.interesting_sizes = [
+            # Exactly stripe_count objects will exist
+            self.os * self.sc,
+            # Fewer than stripe_count objects will exist
+            self.os * self.sc // 2,
+            self.os * (self.sc - 1) + self.os // 2,
+            self.os * (self.sc - 1) + self.os // 2 - 1,
+            self.os * (self.sc + 1) + self.os // 2,
+            self.os * (self.sc + 1) + self.os // 2 + 1,
+            # More than stripe_count objects will exist
+            self.os * self.sc + self.os * self.sc // 2
+        ]
+
+    def write(self):
+        # Create a dir with a striped layout set on it
+        self._mount.run_shell(["mkdir", "stripey"])
+
+        self._mount.setfattr("./stripey", "ceph.dir.layout",
+             "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format(
+                 ss=self.ss, os=self.os, sc=self.sc,
+                 pool=self._filesystem.get_data_pool_name()
+             ))
+
+        # Write files, then flush metadata so that its layout gets written into an xattr
+        for i, n_bytes in enumerate(self.interesting_sizes):
+            self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+            # This is really just validating the validator
+            self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+        self._filesystem.mds_asok(["flush", "journal"])
+
+        # Write another file in the same way, but this time don't flush the metadata,
+        # so that it won't have the layout xattr
+        self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512)
+        self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512)
+
+        self._initial_state = {
+            "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file")
+        }
+
+    def flush(self):
+        # Pass because we already selectively flushed during write
+        pass
+
+    def validate(self):
+        # The first files should have been recovered into its original location
+        # with the correct layout: read back correct data
+        for i, n_bytes in enumerate(self.interesting_sizes):
+            try:
+                self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+            except CommandFailedError as e:
+                self._errors.append(
+                    ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3))
+                )
+
+        # The unflushed file should have been recovered into lost+found without
+        # the correct layout: read back junk
+        ino_name = "%x" % self._initial_state["unflushed_ino"]
+        self.assert_equal(self._mount.ls("lost+found"), [ino_name])
+        try:
+            self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512)
+        except CommandFailedError:
+            pass
+        else:
+            self._errors.append(
+                ValidationError("Unexpectedly valid data in unflushed striped file", "")
+            )
+
+        return self._errors
+
+
+class ManyFilesWorkload(Workload):
+    def __init__(self, filesystem, mount, file_count):
+        super(ManyFilesWorkload, self).__init__(filesystem, mount)
+        self.file_count = file_count
+
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        for n in range(0, self.file_count):
+            self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
+
+    def validate(self):
+        for n in range(0, self.file_count):
+            try:
+                self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
+            except CommandFailedError as e:
+                self._errors.append(
+                    ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3))
+                )
+
+        return self._errors
+
+
+class MovedDir(Workload):
+    def write(self):
+        # Create a nested dir that we will then move.  Two files with two different
+        # backtraces referring to the moved dir, claiming two different locations for
+        # it.  We will see that only one backtrace wins and the dir ends up with
+        # single linkage.
+        self._mount.run_shell(["mkdir", "-p", "grandmother/parent"])
+        self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1)
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._mount.run_shell(["mkdir", "grandfather"])
+        self._mount.run_shell(["mv", "grandmother/parent", "grandfather"])
+        self._mount.write_n_mb("grandfather/parent/new_pos_file", 2)
+        self._filesystem.mds_asok(["flush", "journal"])
+
+        self._initial_state = (
+            self._mount.stat("grandfather/parent/orig_pos_file"),
+            self._mount.stat("grandfather/parent/new_pos_file")
+        )
+
+    def validate(self):
+        root_files = self._mount.ls()
+        self.assert_equal(len(root_files), 1)
+        self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True)
+        winner = root_files[0]
+        st_opf = self._mount.stat("{0}/parent/orig_pos_file".format(winner))
+        st_npf = self._mount.stat("{0}/parent/new_pos_file".format(winner))
+
+        self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size'])
+        self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size'])
+
+
+class MissingZerothObject(Workload):
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def damage(self):
+        super(MissingZerothObject, self).damage()
+        zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino'])
+        self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name())
+
+    def validate(self):
+        st = self._mount.stat("lost+found/{0:x}".format(self._initial_state['st_ino']))
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+
+class NonDefaultLayout(Workload):
+    """
+    Check that the reconstruction copes with files that have a different
+    object size in their layout
+    """
+    def write(self):
+        self._mount.run_shell(["touch", "datafile"])
+        self._mount.setfattr("./datafile", "ceph.file.layout.object_size", "8388608")
+        self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"])
+        self._initial_state = self._mount.stat("datafile")
+
+    def validate(self):
+        # Check we got the layout reconstructed properly
+        object_size = int(self._mount.getfattr(
+            "./datafile", "ceph.file.layout.object_size"))
+        self.assert_equal(object_size, 8388608)
+
+        # Check we got the file size reconstructed properly
+        st = self._mount.stat("datafile")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+
+class TestDataScan(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    def _rebuild_metadata(self, workload, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        workload.flush()
+
+        # Stop the MDS
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+                '--yes-i-really-mean-it')
+
+        self.fs.mds_restart()
+
+        def get_state(mds_id):
+            info = self.mds_cluster.get_mds_info(mds_id)
+            return info['state'] if info is not None else None
+
+        self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+        for mds_id in self.fs.mds_ids:
+            self.wait_until_equal(
+                    lambda: get_state(mds_id),
+                    "up:standby",
+                    timeout=60)
+
+        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure
+        if False:
+            with self.assertRaises(CommandFailedError):
+                # Normal reset should fail when no objects are present, we'll use --force instead
+                self.fs.journal_tool(["journal", "reset"], 0)
+
+        self.fs.journal_tool(["journal", "reset", "--force"], 0)
+        self.fs.data_scan(["init"])
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
+        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
+
+        # Mark the MDS repaired
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+        # Start the MDS
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        log.info(str(self.mds_cluster.status()))
+
+        # Mount a client
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a))
+
+    def test_rebuild_moved_file(self):
+        self._rebuild_metadata(MovedFile(self.fs, self.mount_a))
+
+    def test_rebuild_backtraceless(self):
+        self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a))
+
+    def test_rebuild_moved_dir(self):
+        self._rebuild_metadata(MovedDir(self.fs, self.mount_a))
+
+    def test_rebuild_missing_zeroth(self):
+        self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a))
+
+    def test_rebuild_nondefault_layout(self):
+        self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a))
+
+    def test_stashed_layout(self):
+        self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))
+
+    def _dirfrag_keys(self, object_id):
+        keys_str = self.fs.rados(["listomapkeys", object_id])
+        if keys_str:
+            return keys_str.split("\n")
+        else:
+            return []
+
+    def test_fragmented_injection(self):
+        """
+        That when injecting a dentry into a fragmented directory, we put it in the right fragment.
+        """
+
+        file_count = 100
+        file_names = ["%s" % n for n in range(0, file_count)]
+
+        # Make sure and disable dirfrag auto merging and splitting
+        self.fs.set_ceph_conf('mds', 'mds bal merge size', 0)
+        self.fs.set_ceph_conf('mds', 'mds bal split size', 100 * file_count)
+
+        # Create a directory of `file_count` files, each named after its
+        # decimal number and containing the string of its decimal number
+        self.mount_a.run_python(dedent("""
+        import os
+        path = os.path.join("{path}", "subdir")
+        os.mkdir(path)
+        for n in range(0, {file_count}):
+            open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+        """.format(
+            path=self.mount_a.mountpoint,
+            file_count=file_count
+        )))
+
+        dir_ino = self.mount_a.path_to_ino("subdir")
+
+        # Only one MDS should be active!
+        self.assertEqual(len(self.fs.get_active_names()), 1)
+
+        # Ensure that one directory is fragmented
+        mds_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id)
+
+        # Flush journal and stop MDS
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Pick a dentry and wipe out its key
+        # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
+        frag_obj_id = "{0:x}.01000000".format(dir_ino)
+        keys = self._dirfrag_keys(frag_obj_id)
+        victim_key = keys[7]  # arbitrary choice
+        log.info("victim_key={0}".format(victim_key))
+        victim_dentry = victim_key.split("_head")[0]
+        self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
+
+        # Start filesystem back up, observe that the file appears to be gone in an `ls`
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
+        self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry]))))
+
+        # Stop the filesystem
+        self.mount_a.umount_wait()
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Run data-scan, observe that it inserts our dentry back into the correct fragment
+        # by checking the omap now has the dentry's key again
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
+        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()])
+        self.fs.data_scan(["scan_links"])
+        self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id))
+
+        # Start the filesystem and check that the dentry we deleted is now once again visible
+        # and points to the correct file data.
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
+        self.assertEqual(out, victim_dentry)
+
+        # Finally, close the loop by checking our injected dentry survives a merge
+        mds_id = self.fs.get_active_names()[0]
+        self.mount_a.ls("subdir")  # Do an ls to ensure both frags are in cache so the merge will work
+        self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id)
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        frag_obj_id = "{0:x}.00000000".format(dir_ino)
+        keys = self._dirfrag_keys(frag_obj_id)
+        self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names]))
+
+        # run scrub to update and make sure rstat.rbytes info in subdir inode and dirfrag
+        # are matched
+        out_json = self.fs.rank_tell(["scrub", "start", "/subdir", "repair", "recursive"])
+        self.assertNotEqual(out_json, None)
+
+        # Remove the whole 'sudbdir' directory
+        self.mount_a.run_shell(["rm", "-rf", "subdir/"])
+
+    @for_teuthology
+    def test_parallel_execution(self):
+        self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7)
+
+    def test_pg_files(self):
+        """
+        That the pg files command tells us which files are associated with
+        a particular PG
+        """
+        file_count = 20
+        self.mount_a.run_shell(["mkdir", "mydir"])
+        self.mount_a.create_n_files("mydir/myfile", file_count)
+
+        # Some files elsewhere in the system that we will ignore
+        # to check that the tool is filtering properly
+        self.mount_a.run_shell(["mkdir", "otherdir"])
+        self.mount_a.create_n_files("otherdir/otherfile", file_count)
+
+        pgs_to_files = defaultdict(list)
+        # Rough (slow) reimplementation of the logic
+        for i in range(0, file_count):
+            file_path = "mydir/myfile_{0}".format(i)
+            ino = self.mount_a.path_to_ino(file_path)
+            obj = "{0:x}.{1:08x}".format(ino, 0)
+            pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd(
+                "osd", "map", self.fs.get_data_pool_name(), obj,
+                "--format=json-pretty"
+            ))['pgid']
+            pgs_to_files[pgid].append(file_path)
+            log.info("{0}: {1}".format(file_path, pgid))
+
+        pg_count = self.fs.pgs_per_fs_pool
+        for pg_n in range(0, pg_count):
+            pg_str = "{0}.{1}".format(self.fs.get_data_pool_id(), pg_n)
+            out = self.fs.data_scan(["pg_files", "mydir", pg_str])
+            lines = [l for l in out.split("\n") if l]
+            log.info("{0}: {1}".format(pg_str, lines))
+            self.assertSetEqual(set(lines), set(pgs_to_files[pg_str]))
+
+    def test_rebuild_linkage(self):
+        """
+        The scan_links command fixes linkage errors
+        """
+        self.mount_a.run_shell(["mkdir", "testdir1"])
+        self.mount_a.run_shell(["mkdir", "testdir2"])
+        dir1_ino = self.mount_a.path_to_ino("testdir1")
+        dir2_ino = self.mount_a.path_to_ino("testdir2")
+        dirfrag1_oid = "{0:x}.00000000".format(dir1_ino)
+        dirfrag2_oid = "{0:x}.00000000".format(dir2_ino)
+
+        self.mount_a.run_shell(["touch", "testdir1/file1"])
+        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"])
+        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"])
+
+        mds_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+
+        dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid)
+
+        # introduce duplicated primary link
+        file1_key = "file1_head"
+        self.assertIn(file1_key, dirfrag1_keys)
+        file1_omap_data = self.fs.rados(["getomapval", dirfrag1_oid, file1_key, '-'],
+                                        stdout_data=BytesIO())
+        self.fs.rados(["setomapval", dirfrag2_oid, file1_key], stdin_data=file1_omap_data)
+        self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
+
+        # remove a remote link, make inode link count incorrect
+        link1_key = 'link1_head'
+        self.assertIn(link1_key, dirfrag1_keys)
+        self.fs.rados(["rmomapkey", dirfrag1_oid, link1_key])
+
+        # increase good primary link's version
+        self.mount_a.run_shell(["touch", "testdir1/file1"])
+        self.mount_a.umount_wait()
+
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # repair linkage errors
+        self.fs.data_scan(["scan_links"])
+
+        # primary link in testdir2 was deleted?
+        self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # link count was adjusted?
+        file1_nlink = self.mount_a.path_to_nlink("testdir1/file1")
+        self.assertEqual(file1_nlink, 2)
+
+    def test_rebuild_inotable(self):
+        """
+        The scan_links command repair inotables
+        """
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+
+        active_mds_names = self.fs.get_active_names()
+        mds0_id = active_mds_names[0]
+        mds1_id = active_mds_names[1]
+
+        self.mount_a.run_shell(["mkdir", "dir1"])
+        dir_ino = self.mount_a.path_to_ino("dir1")
+        self.mount_a.setfattr("dir1", "ceph.dir.pin", "1")
+        # wait for subtree migration
+
+        file_ino = 0;
+        while True:
+            time.sleep(1)
+            # allocate an inode from mds.1
+            self.mount_a.run_shell(["touch", "dir1/file1"])
+            file_ino = self.mount_a.path_to_ino("dir1/file1")
+            if file_ino >= (2 << 40):
+                break
+            self.mount_a.run_shell(["rm", "-f", "dir1/file1"])
+
+        self.mount_a.umount_wait()
+
+        self.fs.mds_asok(["flush", "journal"], mds0_id)
+        self.fs.mds_asok(["flush", "journal"], mds1_id)
+        self.mds_cluster.mds_stop()
+
+        self.fs.rados(["rm", "mds0_inotable"])
+        self.fs.rados(["rm", "mds1_inotable"])
+
+        self.fs.data_scan(["scan_links", "--filesystem", self.fs.name])
+
+        mds0_inotable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "inode"]))
+        self.assertGreaterEqual(
+            mds0_inotable['0']['data']['inotable']['free'][0]['start'], dir_ino)
+
+        mds1_inotable = json.loads(self.fs.table_tool([self.fs.name + ":1", "show", "inode"]))
+        self.assertGreaterEqual(
+            mds1_inotable['1']['data']['inotable']['free'][0]['start'], file_ino)
+
+    def test_rebuild_snaptable(self):
+        """
+        The scan_links command repair snaptable
+        """
+        self.fs.set_allow_new_snaps(True)
+
+        self.mount_a.run_shell(["mkdir", "dir1"])
+        self.mount_a.run_shell(["mkdir", "dir1/.snap/s1"])
+        self.mount_a.run_shell(["mkdir", "dir1/.snap/s2"])
+        self.mount_a.run_shell(["rmdir", "dir1/.snap/s2"])
+
+        self.mount_a.umount_wait()
+
+        mds0_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["flush", "journal"], mds0_id)
+
+        # wait for mds to update removed snaps
+        time.sleep(10)
+
+        old_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"]))
+        # stamps may have minor difference
+        for item in old_snaptable['snapserver']['snaps']:
+            del item['stamp']
+
+        self.fs.rados(["rm", "mds_snaptable"])
+        self.fs.data_scan(["scan_links", "--filesystem", self.fs.name])
+
+        new_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"]))
+        for item in new_snaptable['snapserver']['snaps']:
+            del item['stamp']
+        self.assertGreaterEqual(
+            new_snaptable['snapserver']['last_snap'], old_snaptable['snapserver']['last_snap'])
+        self.assertEqual(
+            new_snaptable['snapserver']['snaps'], old_snaptable['snapserver']['snaps'])
diff --git a/qa/tasks/cephfs/test_dump_tree.py b/qa/tasks/cephfs/test_dump_tree.py
new file mode 100644
index 00000000..48a2c6f0
--- /dev/null
+++ b/qa/tasks/cephfs/test_dump_tree.py
@@ -0,0 +1,66 @@
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import random
+import os
+
+class TestDumpTree(CephFSTestCase):
+    def get_paths_to_ino(self):
+        inos = {}
+        p = self.mount_a.run_shell(["find", "./"])
+        paths = p.stdout.getvalue().strip().split()
+        for path in paths:
+            inos[path] = self.mount_a.path_to_ino(path, False)
+
+        return inos
+
+    def populate(self):
+        self.mount_a.run_shell(["git", "clone",
+                                "https://github.com/ceph/ceph-qa-suite"])
+
+    def test_basic(self):
+        self.mount_a.run_shell(["mkdir", "parent"])
+        self.mount_a.run_shell(["mkdir", "parent/child"])
+        self.mount_a.run_shell(["touch", "parent/child/file"])
+        self.mount_a.run_shell(["mkdir", "parent/child/grandchild"])
+        self.mount_a.run_shell(["touch", "parent/child/grandchild/file"])
+
+        inos = self.get_paths_to_ino()
+        tree = self.fs.mds_asok(["dump", "tree", "/parent/child", "1"])
+
+        target_inos = [inos["./parent/child"], inos["./parent/child/file"],
+                       inos["./parent/child/grandchild"]]
+
+        for ino in tree:
+            del target_inos[target_inos.index(ino['ino'])] # don't catch!
+            
+        assert(len(target_inos) == 0)
+
+    def test_random(self):
+        random.seed(0)
+
+        self.populate()
+        inos = self.get_paths_to_ino()
+        target = random.sample(inos.keys(), 1)[0]
+
+        if target != "./":
+            target = os.path.dirname(target)
+
+        subtree = [path for path in inos.keys() if path.startswith(target)]
+        target_inos = [inos[path] for path in subtree]
+        tree = self.fs.mds_asok(["dump", "tree", target[1:]])
+
+        for ino in tree:
+            del target_inos[target_inos.index(ino['ino'])] # don't catch!
+            
+        assert(len(target_inos) == 0)
+
+        target_depth = target.count('/')
+        maxdepth = max([path.count('/') for path in subtree]) - target_depth
+        depth = random.randint(0, maxdepth)
+        target_inos = [inos[path] for path in subtree \
+                       if path.count('/') <= depth + target_depth]
+        tree = self.fs.mds_asok(["dump", "tree", target[1:], str(depth)])
+
+        for ino in tree:
+            del target_inos[target_inos.index(ino['ino'])] # don't catch!
+            
+        assert(len(target_inos) == 0)
diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py
new file mode 100644
index 00000000..abaf92e6
--- /dev/null
+++ b/qa/tasks/cephfs/test_exports.py
@@ -0,0 +1,176 @@
+import logging
+import time
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class TestExports(CephFSTestCase):
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
+
+    def test_export_pin(self):
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+
+        status = self.fs.status()
+
+        self.mount_a.run_shell(["mkdir", "-p", "1/2/3"])
+        self._wait_subtrees(status, 0, [])
+
+        # NOP
+        self.mount_a.setfattr("1", "ceph.dir.pin", "-1")
+        self._wait_subtrees(status, 0, [])
+
+        # NOP (rank < -1)
+        self.mount_a.setfattr("1", "ceph.dir.pin", "-2341")
+        self._wait_subtrees(status, 0, [])
+
+        # pin /1 to rank 1
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1)])
+
+        # Check export_targets is set properly
+        status = self.fs.status()
+        log.info(status)
+        r0 = status.get_rank(self.fs.id, 0)
+        self.assertTrue(sorted(r0['export_targets']) == [1])
+
+        # redundant pin /1/2 to rank 1
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)])
+
+        # change pin /1/2 to rank 0
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "0")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)])
+        self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)])
+
+        # change pin /1/2/3 to (presently) non-existent rank 2
+        self.mount_a.setfattr("1/2/3", "ceph.dir.pin", "2")
+        self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)])
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)])
+
+        # change pin /1/2 back to rank 1
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)])
+
+        # add another directory pinned to 1
+        self.mount_a.run_shell(["mkdir", "-p", "1/4/5"])
+        self.mount_a.setfattr("1/4/5", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1), ('/1/4/5', 1)])
+
+        # change pin /1 to 0
+        self.mount_a.setfattr("1", "ceph.dir.pin", "0")
+        self._wait_subtrees(status, 0, [('/1', 0), ('/1/2', 1), ('/1/4/5', 1)])
+
+        # change pin /1/2 to default (-1); does the subtree root properly respect it's parent pin?
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "-1")
+        self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1)])
+
+        if len(list(status.get_standbys())):
+            self.fs.set_max_mds(3)
+            self.fs.wait_for_state('up:active', rank=2)
+            self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2)])
+
+            # Check export_targets is set properly
+            status = self.fs.status()
+            log.info(status)
+            r0 = status.get_rank(self.fs.id, 0)
+            self.assertTrue(sorted(r0['export_targets']) == [1,2])
+            r1 = status.get_rank(self.fs.id, 1)
+            self.assertTrue(sorted(r1['export_targets']) == [0])
+            r2 = status.get_rank(self.fs.id, 2)
+            self.assertTrue(sorted(r2['export_targets']) == [])
+
+        # Test rename
+        self.mount_a.run_shell(["mkdir", "-p", "a/b", "aa/bb"])
+        self.mount_a.setfattr("a", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("aa/bb", "ceph.dir.pin", "0")
+        if (len(self.fs.get_active_names()) > 2):
+            self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/aa/bb', 0)])
+        else:
+            self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/a', 1), ('/aa/bb', 0)])
+        self.mount_a.run_shell(["mv", "aa", "a/b/"])
+        if (len(self.fs.get_active_names()) > 2):
+            self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/a/b/aa/bb', 0)])
+        else:
+            self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/a', 1), ('/a/b/aa/bb', 0)])
+
+    def test_export_pin_getfattr(self):
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+
+        status = self.fs.status()
+
+        self.mount_a.run_shell(["mkdir", "-p", "1/2/3"])
+        self._wait_subtrees(status, 0, [])
+
+        # pin /1 to rank 0
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1)])
+
+        # pin /1/2 to rank 1
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)])
+
+        # change pin /1/2 to rank 0
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "0")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)])
+        self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)])
+
+         # change pin /1/2/3 to (presently) non-existent rank 2
+        self.mount_a.setfattr("1/2/3", "ceph.dir.pin", "2")
+        self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)])
+
+        if len(list(status.get_standbys())):
+            self.fs.set_max_mds(3)
+            self.fs.wait_for_state('up:active', rank=2)
+            self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0), ('/1/2/3', 2)])
+
+        if not isinstance(self.mount_a, FuseMount):
+            p = self.mount_a.client_remote.sh('uname -r', wait=True)
+            dir_pin = self.mount_a.getfattr("1", "ceph.dir.pin")
+            log.debug("mount.getfattr('1','ceph.dir.pin'): %s " % dir_pin)
+            if str(p) < "5" and not(dir_pin):
+                self.skipTest("Kernel does not support getting the extended attribute ceph.dir.pin")
+        self.assertEqual(self.mount_a.getfattr("1", "ceph.dir.pin"), '1')
+        self.assertEqual(self.mount_a.getfattr("1/2", "ceph.dir.pin"), '0')
+        if (len(self.fs.get_active_names()) > 2):
+            self.assertEqual(self.mount_a.getfattr("1/2/3", "ceph.dir.pin"), '2')
+
+    def test_session_race(self):
+        """
+        Test session creation race.
+
+        See: https://tracker.ceph.com/issues/24072#change-113056
+        """
+
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        rank1 = self.fs.get_rank(rank=1, status=status)
+
+        # Create a directory that is pre-exported to rank 1
+        self.mount_a.run_shell(["mkdir", "-p", "a/aa"])
+        self.mount_a.setfattr("a", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/a', 1)])
+
+        # Now set the mds config to allow the race
+        self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "true"], rank=1)
+
+        # Now create another directory and try to export it
+        self.mount_b.run_shell(["mkdir", "-p", "b/bb"])
+        self.mount_b.setfattr("b", "ceph.dir.pin", "1")
+
+        time.sleep(5)
+
+        # Now turn off the race so that it doesn't wait again
+        self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "false"], rank=1)
+
+        # Now try to create a session with rank 1 by accessing a dir known to
+        # be there, if buggy, this should cause the rank 1 to crash:
+        self.mount_b.run_shell(["ls", "a"])
+
+        # Check if rank1 changed (standby tookover?)
+        new_rank1 = self.fs.get_rank(rank=1)
+        self.assertEqual(rank1['gid'], new_rank1['gid'])
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
new file mode 100644
index 00000000..c87afbf6
--- /dev/null
+++ b/qa/tasks/cephfs/test_failover.py
@@ -0,0 +1,638 @@
+import time
+import signal
+import logging
+from unittest import case, SkipTest
+from random import randint
+from six.moves import range
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.fuse_mount import FuseMount
+
+log = logging.getLogger(__name__)
+
+
+class TestClusterResize(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 3
+
+    def grow(self, n):
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        fscid = self.fs.id
+        status = self.fs.status()
+        log.info("status = {0}".format(status))
+
+        original_ranks = set([info['gid'] for info in status.get_ranks(fscid)])
+        _ = set([info['gid'] for info in status.get_standbys()])
+
+        oldmax = self.fs.get_var('max_mds')
+        self.assertTrue(n > oldmax)
+        self.fs.set_max_mds(n)
+
+        log.info("Waiting for cluster to grow.")
+        status = self.fs.wait_for_daemons(timeout=60+grace*2)
+        ranks = set([info['gid'] for info in status.get_ranks(fscid)])
+        self.assertTrue(original_ranks.issubset(ranks) and len(ranks) == n)
+        return status
+
+    def shrink(self, n):
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        fscid = self.fs.id
+        status = self.fs.status()
+        log.info("status = {0}".format(status))
+
+        original_ranks = set([info['gid'] for info in status.get_ranks(fscid)])
+        _ = set([info['gid'] for info in status.get_standbys()])
+
+        oldmax = self.fs.get_var('max_mds')
+        self.assertTrue(n < oldmax)
+        self.fs.set_max_mds(n)
+
+        # Wait until the monitor finishes stopping ranks >= n
+        log.info("Waiting for cluster to shink.")
+        status = self.fs.wait_for_daemons(timeout=60+grace*2)
+        ranks = set([info['gid'] for info in status.get_ranks(fscid)])
+        self.assertTrue(ranks.issubset(original_ranks) and len(ranks) == n)
+        return status
+
+
+    def test_grow(self):
+        """
+        That the MDS cluster grows after increasing max_mds.
+        """
+
+        # Need all my standbys up as well as the active daemons
+        # self.wait_for_daemon_start() necessary?
+
+        self.grow(2)
+        self.grow(3)
+
+
+    def test_shrink(self):
+        """
+        That the MDS cluster shrinks automatically after decreasing max_mds.
+        """
+
+        self.grow(3)
+        self.shrink(1)
+
+    def test_up_less_than_max(self):
+        """
+        That a health warning is generated when max_mds is greater than active count.
+        """
+
+        status = self.fs.status()
+        mdss = [info['gid'] for info in status.get_all()]
+        self.fs.set_max_mds(len(mdss)+1)
+        self.wait_for_health("MDS_UP_LESS_THAN_MAX", 30)
+        self.shrink(2)
+        self.wait_for_health_clear(30)
+
+    def test_down_health(self):
+        """
+        That marking a FS down does not generate a health warning
+        """
+
+        self.mount_a.umount_wait()
+
+        self.fs.set_down()
+        try:
+            self.wait_for_health("", 30)
+            raise RuntimeError("got health warning?")
+        except RuntimeError as e:
+            if "Timed out after" in str(e):
+                pass
+            else:
+                raise
+
+    def test_down_twice(self):
+        """
+        That marking a FS down twice does not wipe old_max_mds.
+        """
+
+        self.mount_a.umount_wait()
+
+        self.grow(2)
+        self.fs.set_down()
+        self.fs.wait_for_daemons()
+        self.fs.set_down(False)
+        self.assertEqual(self.fs.get_var("max_mds"), 2)
+        self.fs.wait_for_daemons(timeout=60)
+
+    def test_down_grow(self):
+        """
+        That setting max_mds undoes down.
+        """
+
+        self.mount_a.umount_wait()
+
+        self.fs.set_down()
+        self.fs.wait_for_daemons()
+        self.grow(2)
+        self.fs.wait_for_daemons()
+
+    def test_down(self):
+        """
+        That down setting toggles and sets max_mds appropriately.
+        """
+
+        self.mount_a.umount_wait()
+
+        self.fs.set_down()
+        self.fs.wait_for_daemons()
+        self.assertEqual(self.fs.get_var("max_mds"), 0)
+        self.fs.set_down(False)
+        self.assertEqual(self.fs.get_var("max_mds"), 1)
+        self.fs.wait_for_daemons()
+        self.assertEqual(self.fs.get_var("max_mds"), 1)
+
+    def test_hole(self):
+        """
+        Test that a hole cannot be created in the FS ranks.
+        """
+
+        fscid = self.fs.id
+
+        self.grow(2)
+
+        self.fs.set_max_mds(1)
+        log.info("status = {0}".format(self.fs.status()))
+
+        self.fs.set_max_mds(3)
+        # Don't wait for rank 1 to stop
+
+        self.fs.set_max_mds(2)
+        # Prevent another MDS from taking rank 1
+        # XXX This is a little racy because rank 1 may have stopped and a
+        #     standby assigned to rank 1 before joinable=0 is set.
+        self.fs.set_joinable(False) # XXX keep in mind changing max_mds clears this flag
+
+        try:
+            status = self.fs.wait_for_daemons(timeout=90)
+            raise RuntimeError("should not be able to successfully shrink cluster!")
+        except:
+            # could not shrink to max_mds=2 and reach 2 actives (because joinable=False)
+            status = self.fs.status()
+            ranks = set([info['rank'] for info in status.get_ranks(fscid)])
+            self.assertTrue(ranks == set([0]))
+        finally:
+            log.info("status = {0}".format(status))
+
+    def test_thrash(self):
+        """
+        Test that thrashing max_mds does not fail.
+        """
+
+        max_mds = 2
+        for i in range(0, 100):
+            self.fs.set_max_mds(max_mds)
+            max_mds = (max_mds+1)%3+1
+
+        self.fs.wait_for_daemons(timeout=90)
+
+class TestFailover(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 2
+
+    def test_simple(self):
+        """
+        That when the active MDS is killed, a standby MDS is promoted into
+        its rank after the grace period.
+
+        This is just a simple unit test, the harder cases are covered
+        in thrashing tests.
+        """
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+
+        (original_active, ) = self.fs.get_active_names()
+        original_standbys = self.mds_cluster.get_standby_daemons()
+
+        # Kill the rank 0 daemon's physical process
+        self.fs.mds_stop(original_active)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        # Wait until the monitor promotes his replacement
+        def promoted():
+            active = self.fs.get_active_names()
+            return active and active[0] in original_standbys
+
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
+        self.wait_until_true(
+            promoted,
+            timeout=grace*2)
+
+        # Start the original rank 0 daemon up again, see that he becomes a standby
+        self.fs.mds_restart(original_active)
+        self.wait_until_true(
+            lambda: original_active in self.mds_cluster.get_standby_daemons(),
+            timeout=60  # Approximately long enough for MDS to start and mon to notice
+        )
+
+    def test_client_abort(self):
+        """
+        That a client will respect fuse_require_active_mds and error out
+        when the cluster appears to be unavailable.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Requires FUSE client to inject client metadata")
+
+        require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true"
+        if not require_active:
+            raise case.SkipTest("fuse_require_active_mds is not set")
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        # Check it's not laggy to begin with
+        (original_active, ) = self.fs.get_active_names()
+        self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
+
+        self.mounts[0].umount_wait()
+
+        # Control: that we can mount and unmount usually, while the cluster is healthy
+        self.mounts[0].mount()
+        self.mounts[0].wait_until_mounted()
+        self.mounts[0].umount_wait()
+
+        # Stop the daemon processes
+        self.fs.mds_stop()
+
+        # Wait for everyone to go laggy
+        def laggy():
+            mdsmap = self.fs.get_mds_map()
+            for info in mdsmap['info'].values():
+                if "laggy_since" not in info:
+                    return False
+
+            return True
+
+        self.wait_until_true(laggy, grace * 2)
+        with self.assertRaises(CommandFailedError):
+            self.mounts[0].mount()
+
+    def test_standby_count_wanted(self):
+        """
+        That cluster health warnings are generated by insufficient standbys available.
+        """
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        standbys = self.mds_cluster.get_standby_daemons()
+        self.assertGreaterEqual(len(standbys), 1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
+
+        # Kill a standby and check for warning
+        victim = standbys.pop()
+        self.fs.mds_stop(victim)
+        log.info("waiting for insufficient standby daemon warning")
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+
+        # restart the standby, see that he becomes a standby, check health clears
+        self.fs.mds_restart(victim)
+        self.wait_until_true(
+            lambda: victim in self.mds_cluster.get_standby_daemons(),
+            timeout=60  # Approximately long enough for MDS to start and mon to notice
+        )
+        self.wait_for_health_clear(timeout=30)
+
+        # Set it one greater than standbys ever seen
+        standbys = self.mds_cluster.get_standby_daemons()
+        self.assertGreaterEqual(len(standbys), 1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
+        log.info("waiting for insufficient standby daemon warning")
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+
+        # Set it to 0
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
+        self.wait_for_health_clear(timeout=30)
+
+    def test_discontinuous_mdsmap(self):
+        """
+        That discontinuous mdsmap does not affect failover.
+        See http://tracker.ceph.com/issues/24856.
+        """
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.umount_wait()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+        monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
+
+        mds_0 = self.fs.get_rank(rank=0, status=status)
+        self.fs.rank_freeze(True, rank=0) # prevent failover
+        self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
+        self.wait_until_true(
+            lambda: "laggy_since" in self.fs.get_rank(),
+            timeout=grace * 2
+        )
+
+        self.fs.rank_fail(rank=1)
+        self.fs.wait_for_state('up:resolve', rank=1, timeout=30)
+
+        # Make sure of mds_0's monitor connection gets reset
+        time.sleep(monc_timeout * 2)
+
+        # Continue rank 0, it will get discontinuous mdsmap
+        self.fs.rank_signal(signal.SIGCONT, rank=0)
+        self.wait_until_true(
+            lambda: "laggy_since" not in self.fs.get_rank(rank=0),
+            timeout=grace * 2
+        )
+
+        # mds.b will be stuck at 'reconnect' state if snapserver gets confused
+        # by discontinuous mdsmap
+        self.fs.wait_for_state('up:active', rank=1, timeout=30)
+        self.assertEqual(mds_0['gid'], self.fs.get_rank(rank=0)['gid'])
+        self.fs.rank_freeze(False, rank=0)
+
+class TestStandbyReplay(CephFSTestCase):
+    MDSS_REQUIRED = 4
+
+    def _confirm_no_replay(self):
+        status = self.fs.status()
+        _ = len(list(status.get_standbys()))
+        self.assertEqual(0, len(list(self.fs.get_replays(status=status))))
+        return status
+
+    def _confirm_single_replay(self, full=True, status=None, retries=3):
+        status = self.fs.wait_for_daemons(status=status)
+        ranks = sorted(self.fs.get_mds_map(status=status)['in'])
+        replays = list(self.fs.get_replays(status=status))
+        checked_replays = set()
+        for rank in ranks:
+            has_replay = False
+            for replay in replays:
+                if replay['rank'] == rank:
+                    self.assertFalse(has_replay)
+                    has_replay = True
+                    checked_replays.add(replay['gid'])
+            if full and not has_replay:
+                if retries <= 0:
+                    raise RuntimeError("rank "+str(rank)+" has no standby-replay follower")
+                else:
+                    retries = retries-1
+                    time.sleep(2)
+        self.assertEqual(checked_replays, set(info['gid'] for info in replays))
+        return status
+
+    def _check_replay_takeover(self, status, rank=0):
+        replay = self.fs.get_replay(rank=rank, status=status)
+        new_status = self.fs.wait_for_daemons()
+        new_active = self.fs.get_rank(rank=rank, status=new_status)
+        if replay:
+            self.assertEqual(replay['gid'], new_active['gid'])
+        else:
+            # double check takeover came from a standby (or some new daemon via restart)
+            found = False
+            for info in status.get_standbys():
+                if info['gid'] == new_active['gid']:
+                    found = True
+                    break
+            if not found:
+                for info in status.get_all():
+                    self.assertNotEqual(info['gid'], new_active['gid'])
+        return new_status
+
+    def test_standby_replay_singleton(self):
+        """
+        That only one MDS becomes standby-replay.
+        """
+
+        self._confirm_no_replay()
+        self.fs.set_allow_standby_replay(True)
+        time.sleep(30)
+        self._confirm_single_replay()
+
+    def test_standby_replay_singleton_fail(self):
+        """
+        That failures don't violate singleton constraint.
+        """
+
+        self._confirm_no_replay()
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay()
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            self.fs.rank_restart(status=status)
+            status = self._check_replay_takeover(status)
+            status = self._confirm_single_replay(status=status)
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            self.fs.rank_fail()
+            status = self._check_replay_takeover(status)
+            status = self._confirm_single_replay(status=status)
+
+    def test_standby_replay_singleton_fail_multimds(self):
+        """
+        That failures don't violate singleton constraint with multiple actives.
+        """
+
+        status = self._confirm_no_replay()
+        new_max_mds = randint(2, len(list(status.get_standbys())))
+        self.fs.set_max_mds(new_max_mds)
+        self.fs.wait_for_daemons() # wait for actives to come online!
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay(full=False)
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            victim = randint(0, new_max_mds-1)
+            self.fs.rank_restart(rank=victim, status=status)
+            status = self._check_replay_takeover(status, rank=victim)
+            status = self._confirm_single_replay(status=status, full=False)
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            victim = randint(0, new_max_mds-1)
+            self.fs.rank_fail(rank=victim)
+            status = self._check_replay_takeover(status, rank=victim)
+            status = self._confirm_single_replay(status=status, full=False)
+
+    def test_standby_replay_failure(self):
+        """
+        That the failure of a standby-replay daemon happens cleanly
+        and doesn't interrupt anything else.
+        """
+
+        status = self._confirm_no_replay()
+        self.fs.set_max_mds(1)
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay()
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            victim = self.fs.get_replay(status=status)
+            self.fs.mds_restart(mds_id=victim['name'])
+            status = self._confirm_single_replay(status=status)
+
+    def test_rank_stopped(self):
+        """
+        That when a rank is STOPPED, standby replays for
+        that rank get torn down
+        """
+
+        status = self._confirm_no_replay()
+        standby_count = len(list(status.get_standbys()))
+        self.fs.set_max_mds(2)
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay()
+
+        self.fs.set_max_mds(1) # stop rank 1
+
+        status = self._confirm_single_replay()
+        self.assertTrue(standby_count, len(list(status.get_standbys())))
+
+
+class TestMultiFilesystems(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 4
+
+    # We'll create our own filesystems and start our own daemons
+    REQUIRE_FILESYSTEM = False
+
+    def setUp(self):
+        super(TestMultiFilesystems, self).setUp()
+        self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
+            "enable_multiple", "true",
+            "--yes-i-really-mean-it")
+
+    def _setup_two(self):
+        fs_a = self.mds_cluster.newfs("alpha")
+        fs_b = self.mds_cluster.newfs("bravo")
+
+        self.mds_cluster.mds_restart()
+
+        # Wait for both filesystems to go healthy
+        fs_a.wait_for_daemons()
+        fs_b.wait_for_daemons()
+
+        # Reconfigure client auth caps
+        for mount in self.mounts:
+            self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+                'auth', 'caps', "client.{0}".format(mount.client_id),
+                'mds', 'allow',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    fs_a.get_data_pool_name(), fs_b.get_data_pool_name()))
+
+        return fs_a, fs_b
+
+    def test_clients(self):
+        fs_a, fs_b = self._setup_two()
+
+        # Mount a client on fs_a
+        self.mount_a.mount(mount_fs_name=fs_a.name)
+        self.mount_a.write_n_mb("pad.bin", 1)
+        self.mount_a.write_n_mb("test.bin", 2)
+        a_created_ino = self.mount_a.path_to_ino("test.bin")
+        self.mount_a.create_files()
+
+        # Mount a client on fs_b
+        self.mount_b.mount(mount_fs_name=fs_b.name)
+        self.mount_b.write_n_mb("test.bin", 1)
+        b_created_ino = self.mount_b.path_to_ino("test.bin")
+        self.mount_b.create_files()
+
+        # Check that a non-default filesystem mount survives an MDS
+        # failover (i.e. that map subscription is continuous, not
+        # just the first time), reproduces #16022
+        old_fs_b_mds = fs_b.get_active_names()[0]
+        self.mds_cluster.mds_stop(old_fs_b_mds)
+        self.mds_cluster.mds_fail(old_fs_b_mds)
+        fs_b.wait_for_daemons()
+        background = self.mount_b.write_background()
+        # Raise exception if the write doesn't finish (i.e. if client
+        # has not kept up with MDS failure)
+        try:
+            self.wait_until_true(lambda: background.finished, timeout=30)
+        except RuntimeError:
+            # The mount is stuck, we'll have to force it to fail cleanly
+            background.stdin.close()
+            self.mount_b.umount_wait(force=True)
+            raise
+
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # See that the client's files went into the correct pool
+        self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024))
+        self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024))
+
+    def test_standby(self):
+        fs_a, fs_b = self._setup_two()
+
+        # Assert that the remaining two MDS daemons are now standbys
+        a_daemons = fs_a.get_active_names()
+        b_daemons = fs_b.get_active_names()
+        self.assertEqual(len(a_daemons), 1)
+        self.assertEqual(len(b_daemons), 1)
+        original_a = a_daemons[0]
+        original_b = b_daemons[0]
+        expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons))
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+        self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons())
+
+        # Kill fs_a's active MDS, see a standby take over
+        self.mds_cluster.mds_stop(original_a)
+        self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 1)
+        # Assert that it's a *different* daemon that has now appeared in the map for fs_a
+        self.assertNotEqual(fs_a.get_active_names()[0], original_a)
+
+        # Kill fs_b's active MDS, see a standby take over
+        self.mds_cluster.mds_stop(original_b)
+        self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 1)
+        # Assert that it's a *different* daemon that has now appeared in the map for fs_a
+        self.assertNotEqual(fs_b.get_active_names()[0], original_b)
+
+        # Both of the original active daemons should be gone, and all standbys used up
+        self.assertEqual(self.mds_cluster.get_standby_daemons(), set())
+
+        # Restart the ones I killed, see them reappear as standbys
+        self.mds_cluster.mds_restart(original_a)
+        self.mds_cluster.mds_restart(original_b)
+        self.wait_until_true(
+            lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(),
+            timeout=30
+        )
+
+    def test_grow_shrink(self):
+        # Usual setup...
+        fs_a, fs_b = self._setup_two()
+
+        # Increase max_mds on fs_b, see a standby take up the role
+        fs_b.set_max_mds(2)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Increase max_mds on fs_a, see a standby take up the role
+        fs_a.set_max_mds(2)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Shrink fs_b back to 1, see a daemon go back to standby
+        fs_b.set_max_mds(1)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Grow fs_a up to 3, see the former fs_b daemon join it.
+        fs_a.set_max_mds(3)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60,
+                              reject_fn=lambda v: v > 3 or v < 2)
diff --git a/qa/tasks/cephfs/test_flush.py b/qa/tasks/cephfs/test_flush.py
new file mode 100644
index 00000000..ee0b1c92
--- /dev/null
+++ b/qa/tasks/cephfs/test_flush.py
@@ -0,0 +1,113 @@
+
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+
+
+class TestFlush(CephFSTestCase):
+    def test_flush(self):
+        self.mount_a.run_shell(["mkdir", "mydir"])
+        self.mount_a.run_shell(["touch", "mydir/alpha"])
+        dir_ino = self.mount_a.path_to_ino("mydir")
+        file_ino = self.mount_a.path_to_ino("mydir/alpha")
+
+        # Unmount the client so that it isn't still holding caps
+        self.mount_a.umount_wait()
+
+        # Before flush, the dirfrag object does not exist
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(dir_ino)
+
+        # Before flush, the file's backtrace has not been written
+        with self.assertRaises(ObjectNotFound):
+            self.fs.read_backtrace(file_ino)
+
+        # Before flush, there are no dentries in the root
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+        # Execute flush
+        flush_data = self.fs.mds_asok(["flush", "journal"])
+        self.assertEqual(flush_data['return_code'], 0)
+
+        # After flush, the dirfrag object has been created
+        dir_list = self.fs.list_dirfrag(dir_ino)
+        self.assertEqual(dir_list, ["alpha_head"])
+
+        # And the 'mydir' dentry is in the root
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), ['mydir_head'])
+
+        # ...and the data object has its backtrace
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'mydir'], [a['dname'] for a in backtrace['ancestors']])
+        self.assertEqual([dir_ino, 1], [a['dirino'] for a in backtrace['ancestors']])
+        self.assertEqual(file_ino, backtrace['ino'])
+
+        # ...and the journal is truncated to just a single subtreemap from the
+        # newly created segment
+        summary_output = self.fs.journal_tool(["event", "get", "summary"], 0)
+        try:
+            self.assertEqual(summary_output,
+                             dedent(
+                                 """
+                                 Events by type:
+                                   SUBTREEMAP: 1
+                                 Errors: 0
+                                 """
+                             ).strip())
+        except AssertionError:
+            # In some states, flushing the journal will leave you
+            # an extra event from locks a client held.   This is
+            # correct behaviour: the MDS is flushing the journal,
+            # it's just that new events are getting added too.
+            # In this case, we should nevertheless see a fully
+            # empty journal after a second flush.
+            self.assertEqual(summary_output,
+                             dedent(
+                                 """
+                                 Events by type:
+                                   SUBTREEMAP: 1
+                                   UPDATE: 1
+                                 Errors: 0
+                                 """
+                             ).strip())
+            flush_data = self.fs.mds_asok(["flush", "journal"])
+            self.assertEqual(flush_data['return_code'], 0)
+            self.assertEqual(self.fs.journal_tool(["event", "get", "summary"], 0),
+                             dedent(
+                                 """
+                                 Events by type:
+                                   SUBTREEMAP: 1
+                                 Errors: 0
+                                 """
+                             ).strip())
+
+        # Now for deletion!
+        # We will count the RADOS deletions and MDS file purges, to verify that
+        # the expected behaviour is happening as a result of the purge
+        initial_dels = self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete']
+        initial_purges = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued']
+
+        # Use a client to delete a file
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.run_shell(["rm", "-rf", "mydir"])
+
+        # Flush the journal so that the directory inode can be purged
+        flush_data = self.fs.mds_asok(["flush", "journal"])
+        self.assertEqual(flush_data['return_code'], 0)
+
+        # We expect to see a single file purge
+        self.wait_until_true(
+            lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued'] - initial_purges >= 2,
+            60)
+
+        # We expect two deletions, one of the dirfrag and one of the backtrace
+        self.wait_until_true(
+            lambda: self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] - initial_dels >= 2,
+            60)  # timeout is fairly long to allow for tick+rados latencies
+
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(dir_ino)
+        with self.assertRaises(ObjectNotFound):
+            self.fs.read_backtrace(file_ino)
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
diff --git a/qa/tasks/cephfs/test_forward_scrub.py b/qa/tasks/cephfs/test_forward_scrub.py
new file mode 100644
index 00000000..cc861b38
--- /dev/null
+++ b/qa/tasks/cephfs/test_forward_scrub.py
@@ -0,0 +1,298 @@
+
+"""
+Test that the forward scrub functionality can traverse metadata and apply
+requested tags, on well formed metadata.
+
+This is *not* the real testing for forward scrub, which will need to test
+how the functionality responds to damaged metadata.
+
+"""
+import json
+
+import logging
+import six
+
+from collections import namedtuple
+from io import BytesIO
+from textwrap import dedent
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+import struct
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class TestForwardScrub(CephFSTestCase):
+    MDSS_REQUIRED = 1
+
+    def _read_str_xattr(self, pool, obj, attr):
+        """
+        Read a ceph-encoded string from a rados xattr
+        """
+        output = self.fs.rados(["getxattr", obj, attr], pool=pool,
+                               stdout_data=BytesIO())
+        strlen = struct.unpack('i', output[0:4])[0]
+        return six.ensure_str(output[4:(4 + strlen)], encoding='ascii')
+
+    def _get_paths_to_ino(self):
+        inos = {}
+        p = self.mount_a.run_shell(["find", "./"])
+        paths = p.stdout.getvalue().strip().split()
+        for path in paths:
+            inos[path] = self.mount_a.path_to_ino(path)
+
+        return inos
+
+    def test_apply_tag(self):
+        self.mount_a.run_shell(["mkdir", "parentdir"])
+        self.mount_a.run_shell(["mkdir", "parentdir/childdir"])
+        self.mount_a.run_shell(["touch", "rfile"])
+        self.mount_a.run_shell(["touch", "parentdir/pfile"])
+        self.mount_a.run_shell(["touch", "parentdir/childdir/cfile"])
+
+        # Build a structure mapping path to inode, as we will later want
+        # to check object by object and objects are named after ino number
+        inos = self._get_paths_to_ino()
+
+        # Flush metadata: this is a friendly test of forward scrub so we're skipping
+        # the part where it's meant to cope with dirty metadata
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        tag = "mytag"
+
+        # Execute tagging forward scrub
+        self.fs.mds_asok(["tag", "path", "/parentdir", tag])
+        # Wait for completion
+        import time
+        time.sleep(10)
+        # FIXME watching clog isn't a nice mechanism for this, once we have a ScrubMap we'll
+        # watch that instead
+
+        # Check that dirs were tagged
+        for dirpath in ["./parentdir", "./parentdir/childdir"]:
+            self.assertTagged(inos[dirpath], tag, self.fs.get_metadata_pool_name())
+
+        # Check that files were tagged
+        for filepath in ["./parentdir/pfile", "./parentdir/childdir/cfile"]:
+            self.assertTagged(inos[filepath], tag, self.fs.get_data_pool_name())
+
+        # This guy wasn't in the tag path, shouldn't have been tagged
+        self.assertUntagged(inos["./rfile"])
+
+    def assertUntagged(self, ino):
+        file_obj_name = "{0:x}.00000000".format(ino)
+        with self.assertRaises(CommandFailedError):
+            self._read_str_xattr(
+                self.fs.get_data_pool_name(),
+                file_obj_name,
+                "scrub_tag"
+            )
+
+    def assertTagged(self, ino, tag, pool):
+        file_obj_name = "{0:x}.00000000".format(ino)
+        wrote = self._read_str_xattr(
+            pool,
+            file_obj_name,
+            "scrub_tag"
+        )
+        self.assertEqual(wrote, tag)
+
+    def _validate_linkage(self, expected):
+        inos = self._get_paths_to_ino()
+        try:
+            self.assertDictEqual(inos, expected)
+        except AssertionError:
+            log.error("Expected: {0}".format(json.dumps(expected, indent=2)))
+            log.error("Actual: {0}".format(json.dumps(inos, indent=2)))
+            raise
+
+    def test_orphan_scan(self):
+        # Create some files whose metadata we will flush
+        self.mount_a.run_python(dedent("""
+            import os
+            mount_point = "{mount_point}"
+            parent = os.path.join(mount_point, "parent")
+            os.mkdir(parent)
+            flushed = os.path.join(parent, "flushed")
+            os.mkdir(flushed)
+            for f in ["alpha", "bravo", "charlie"]:
+                open(os.path.join(flushed, f), 'w').write(f)
+        """.format(mount_point=self.mount_a.mountpoint)))
+
+        inos = self._get_paths_to_ino()
+
+        # Flush journal
+        # Umount before flush to avoid cap releases putting
+        # things we don't want in the journal later.
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Create a new inode that's just in the log, i.e. would
+        # look orphaned to backward scan if backward scan wisnae
+        # respectin' tha scrub_tag xattr.
+        self.mount_a.mount()
+        self.mount_a.run_shell(["mkdir", "parent/unflushed"])
+        self.mount_a.run_shell(["dd", "if=/dev/urandom",
+                                "of=./parent/unflushed/jfile",
+                                "bs=1M", "count=8"])
+        inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed")
+        inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile")
+        self.mount_a.umount_wait()
+
+        # Orphan an inode by deleting its dentry
+        # Our victim will be.... bravo.
+        self.mount_a.umount_wait()
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+        frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
+        self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        # See that the orphaned file is indeed missing from a client's POV
+        self.mount_a.mount()
+        damaged_state = self._get_paths_to_ino()
+        self.assertNotIn("./parent/flushed/bravo", damaged_state)
+        self.mount_a.umount_wait()
+
+        # Run a tagging forward scrub
+        tag = "mytag123"
+        self.fs.mds_asok(["tag", "path", "/parent", tag])
+
+        # See that the orphan wisnae tagged
+        self.assertUntagged(inos['./parent/flushed/bravo'])
+
+        # See that the flushed-metadata-and-still-present files are tagged
+        self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name())
+        self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name())
+
+        # See that journalled-but-not-flushed file *was* tagged
+        self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
+
+        # Run cephfs-data-scan targeting only orphans
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
+        self.fs.data_scan([
+            "scan_inodes",
+            "--filter-tag", tag,
+            self.fs.get_data_pool_name()
+        ])
+
+        # After in-place injection stats should be kosher again
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', True)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True)
+
+        # And we should have all the same linkage we started with,
+        # and no lost+found, and no extra inodes!
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount()
+        self._validate_linkage(inos)
+
+    def _stash_inotable(self):
+        # Get all active ranks
+        ranks = self.fs.get_all_mds_rank()
+
+        inotable_dict = {}
+        for rank in ranks:
+            inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable"
+            print("Trying to fetch inotable object: " + inotable_oid)
+
+            #self.fs.get_metadata_object("InoTable", "mds0_inotable")
+            inotable_raw = self.fs.get_metadata_object_raw(inotable_oid)
+            inotable_dict[inotable_oid] = inotable_raw
+        return inotable_dict
+
+    def test_inotable_sync(self):
+        self.mount_a.write_n_mb("file1_sixmegs", 6)
+
+        # Flush journal
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        inotable_copy = self._stash_inotable()
+
+        self.mount_a.mount()
+
+        self.mount_a.write_n_mb("file2_sixmegs", 6)
+        self.mount_a.write_n_mb("file3_sixmegs", 6)
+
+        inos = self._get_paths_to_ino()
+
+        # Flush journal
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        self.mount_a.umount_wait()
+
+        with self.assert_cluster_log("inode table repaired", invert_match=True):
+            out_json = self.fs.rank_tell(["scrub", "start", "/", "repair", "recursive"])
+            self.assertNotEqual(out_json, None)
+
+        self.mds_cluster.mds_stop()
+        self.mds_cluster.mds_fail()
+
+        # Truncate the journal (to ensure the inotable on disk
+        # is all that will be in the InoTable in memory)
+
+        self.fs.journal_tool(["event", "splice",
+                              "--inode={0}".format(inos["./file2_sixmegs"]), "summary"], 0)
+
+        self.fs.journal_tool(["event", "splice",
+                              "--inode={0}".format(inos["./file3_sixmegs"]), "summary"], 0)
+
+        # Revert to old inotable.
+        for key, value in inotable_copy.items():
+           self.fs.put_metadata_object_raw(key, value)
+
+        self.mds_cluster.mds_restart()
+        self.fs.wait_for_daemons()
+
+        with self.assert_cluster_log("inode table repaired"):
+            out_json = self.fs.rank_tell(["scrub", "start", "/", "repair", "recursive"])
+            self.assertNotEqual(out_json, None)
+
+        self.mds_cluster.mds_stop()
+        table_text = self.fs.table_tool(["0", "show", "inode"])
+        table = json.loads(table_text)
+        self.assertGreater(
+                table['0']['data']['inotable']['free'][0]['start'],
+                inos['./file3_sixmegs'])
+
+    def test_backtrace_repair(self):
+        """
+        That the MDS can repair an inodes backtrace in the data pool
+        if it is found to be damaged.
+        """
+        # Create a file for subsequent checks
+        self.mount_a.run_shell(["mkdir", "parent_a"])
+        self.mount_a.run_shell(["touch", "parent_a/alpha"])
+        file_ino = self.mount_a.path_to_ino("parent_a/alpha")
+
+        # That backtrace and layout are written after initial flush
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_a'],
+                         [a['dname'] for a in backtrace['ancestors']])
+
+        # Go corrupt the backtrace
+        self.fs._write_data_xattr(file_ino, "parent",
+                                  "oh i'm sorry did i overwrite your xattr?")
+
+        with self.assert_cluster_log("bad backtrace on inode"):
+            out_json = self.fs.rank_tell(["scrub", "start", "/", "repair", "recursive"])
+            self.assertNotEqual(out_json, None)
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_a'],
+                         [a['dname'] for a in backtrace['ancestors']])
diff --git a/qa/tasks/cephfs/test_fragment.py b/qa/tasks/cephfs/test_fragment.py
new file mode 100644
index 00000000..0ed5da28
--- /dev/null
+++ b/qa/tasks/cephfs/test_fragment.py
@@ -0,0 +1,229 @@
+
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra import run
+
+import logging
+log = logging.getLogger(__name__)
+
+
+class TestFragmentation(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def get_splits(self):
+        return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split']
+
+    def get_merges(self):
+        return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge']
+
+    def get_dir_ino(self, path):
+        dir_cache = self.fs.read_cache(path, 0)
+        dir_ino = None
+        dir_inono = self.mount_a.path_to_ino(path.strip("/"))
+        for ino in dir_cache:
+            if ino['ino'] == dir_inono:
+                dir_ino = ino
+                break
+        self.assertIsNotNone(dir_ino)
+        return dir_ino
+
+    def _configure(self, **kwargs):
+        """
+        Apply kwargs as MDS configuration settings, enable dirfrags
+        and restart the MDSs.
+        """
+
+        for k, v in kwargs.items():
+            self.ceph_cluster.set_ceph_conf("mds", k, v.__str__())
+
+        self.mds_cluster.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+    def test_oversize(self):
+        """
+        That a directory is split when it becomes too large.
+        """
+
+        split_size = 20
+        merge_size = 5
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=1
+        )
+
+        self.assertEqual(self.get_splits(), 0)
+
+        self.mount_a.create_n_files("splitdir/file", split_size + 1)
+
+        self.wait_until_true(
+            lambda: self.get_splits() == 1,
+            timeout=30
+        )
+
+        frags = self.get_dir_ino("/splitdir")['dirfrags']
+        self.assertEqual(len(frags), 2)
+        self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*")
+        self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*")
+        self.assertEqual(
+            sum([len(f['dentries']) for f in frags]),
+            split_size + 1
+        )
+
+        self.assertEqual(self.get_merges(), 0)
+
+        self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])
+
+        self.wait_until_true(
+            lambda: self.get_merges() == 1,
+            timeout=30
+        )
+
+        self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1)
+
+    def test_rapid_creation(self):
+        """
+        That the fast-splitting limit of 1.5x normal limit is
+        applied when creating dentries quickly.
+        """
+
+        split_size = 100
+        merge_size = 1
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=3,
+            mds_bal_fragment_size_max=int(split_size * 1.5 + 2)
+        )
+
+        # We test this only at a single split level.  If a client was sending
+        # IO so fast that it hit a second split before the first split
+        # was complete, it could violate mds_bal_fragment_size_max -- there
+        # is a window where the child dirfrags of a split are unfrozen
+        # (so they can grow), but still have STATE_FRAGMENTING (so they
+        # can't be split).
+
+        # By writing 4x the split size when the split bits are set
+        # to 3 (i.e. 4-ways), I am reasonably sure to see precisely
+        # one split.  The test is to check whether that split
+        # happens soon enough that the client doesn't exceed
+        # 2x the split_size (the "immediate" split mode should
+        # kick in at 1.5x the split size).
+
+        self.assertEqual(self.get_splits(), 0)
+        self.mount_a.create_n_files("splitdir/file", split_size * 4)
+        self.wait_until_equal(
+            self.get_splits,
+            1,
+            reject_fn=lambda s: s > 1,
+            timeout=30
+        )
+
+    def test_deep_split(self):
+        """
+        That when the directory grows many times larger than split size,
+        the fragments get split again.
+        """
+
+        split_size = 100
+        merge_size = 1  # i.e. don't merge frag unless its empty
+        split_bits = 1
+
+        branch_factor = 2**split_bits
+
+        # Arbitrary: how many levels shall we try fragmenting before
+        # ending the test?
+        max_depth = 5
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=split_bits
+        )
+
+        # Each iteration we will create another level of fragments.  The
+        # placement of dentries into fragments is by hashes (i.e. pseudo
+        # random), so we rely on statistics to get the behaviour that
+        # by writing about 1.5x as many dentries as the split_size times
+        # the number of frags, we will get them all to exceed their
+        # split size and trigger a split.
+        depth = 0
+        files_written = 0
+        splits_expected = 0
+        while depth < max_depth:
+            log.info("Writing files for depth {0}".format(depth))
+            target_files = branch_factor**depth * int(split_size * 1.5)
+            create_files = target_files - files_written
+
+            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+                "{0} Writing {1} files (depth={2})".format(
+                    self.__class__.__name__, create_files, depth
+                ))
+            self.mount_a.create_n_files("splitdir/file_{0}".format(depth),
+                                        create_files)
+            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+                "{0} Done".format(self.__class__.__name__))
+
+            files_written += create_files
+            log.info("Now have {0} files".format(files_written))
+
+            splits_expected += branch_factor**depth
+            log.info("Waiting to see {0} splits".format(splits_expected))
+            try:
+                self.wait_until_equal(
+                    self.get_splits,
+                    splits_expected,
+                    timeout=30,
+                    reject_fn=lambda x: x > splits_expected
+                )
+
+                frags = self.get_dir_ino("/splitdir")['dirfrags']
+                self.assertEqual(len(frags), branch_factor**(depth+1))
+                self.assertEqual(
+                    sum([len(f['dentries']) for f in frags]),
+                    target_files
+                )
+            except:
+                # On failures, log what fragmentation we actually ended
+                # up with.  This block is just for logging, at the end
+                # we raise the exception again.
+                frags = self.get_dir_ino("/splitdir")['dirfrags']
+                log.info("depth={0} splits_expected={1} files_written={2}".format(
+                    depth, splits_expected, files_written
+                ))
+                log.info("Dirfrags:")
+                for f in frags:
+                    log.info("{0}: {1}".format(
+                        f['dirfrag'], len(f['dentries'])
+                    ))
+                raise
+
+            depth += 1
+
+        # Remember the inode number because we will be checking for
+        # objects later.
+        dir_inode_no = self.mount_a.path_to_ino("splitdir")
+
+        self.mount_a.run_shell(["rm", "-rf", "splitdir/"])
+        self.mount_a.umount_wait()
+
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # Wait for all strays to purge
+        self.wait_until_equal(
+            lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache']
+                                     )['mds_cache']['num_strays'],
+            0,
+            timeout=1200
+        )
+        # Check that the metadata pool objects for all the myriad
+        # child fragments are gone
+        metadata_objs = self.fs.rados(["ls"])
+        frag_objs = []
+        for o in metadata_objs:
+            if o.startswith("{0:x}.".format(dir_inode_no)):
+                frag_objs.append(o)
+        self.assertListEqual(frag_objs, [])
diff --git a/qa/tasks/cephfs/test_full.py b/qa/tasks/cephfs/test_full.py
new file mode 100644
index 00000000..21470c87
--- /dev/null
+++ b/qa/tasks/cephfs/test_full.py
@@ -0,0 +1,398 @@
+
+
+import json
+import logging
+import os
+from textwrap import dedent
+import time
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class FullnessTestCase(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+
+    # Subclasses define whether they're filling whole cluster or just data pool
+    data_only = False
+
+    # Subclasses define how many bytes should be written to achieve fullness
+    pool_capacity = None
+    fill_mb = None
+
+    # Subclasses define what fullness means to them
+    def is_full(self):
+        raise NotImplementedError()
+
+    def setUp(self):
+        CephFSTestCase.setUp(self)
+
+        mds_status = self.fs.rank_asok(["status"])
+
+        # Capture the initial OSD map epoch for later use
+        self.initial_osd_epoch = mds_status['osdmap_epoch_barrier']
+
+    def test_barrier(self):
+        """
+        That when an OSD epoch barrier is set on an MDS, subsequently
+        issued capabilities cause clients to update their OSD map to that
+        epoch.
+        """
+
+        # script that sync up client with MDS OSD map barrier. The barrier should
+        # be updated by cap flush ack message.
+        pyscript = dedent("""
+            import os
+            fd = os.open("{path}", os.O_CREAT | os.O_RDWR, 0O600)
+            os.fchmod(fd, 0O666)
+            os.fsync(fd)
+            os.close(fd)
+            """)
+
+        # Sync up client with initial MDS OSD map barrier.
+        path = os.path.join(self.mount_a.mountpoint, "foo")
+        self.mount_a.run_python(pyscript.format(path=path))
+
+        # Grab mounts' initial OSD epochs: later we will check that
+        # it hasn't advanced beyond this point.
+        mount_a_initial_epoch, mount_a_initial_barrier = self.mount_a.get_osd_epoch()
+
+        # Freshly mounted at start of test, should be up to date with OSD map
+        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)
+
+        # Set and unset a flag to cause OSD epoch to increment
+        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
+        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")
+
+        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
+        new_epoch = json.loads(out)['epoch']
+        self.assertNotEqual(self.initial_osd_epoch, new_epoch)
+
+        # Do a metadata operation on clients, witness that they end up with
+        # the old OSD map from startup time (nothing has prompted client
+        # to update its map)
+        path = os.path.join(self.mount_a.mountpoint, "foo")
+        self.mount_a.run_python(pyscript.format(path=path))
+        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
+        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
+        self.assertEqual(mount_a_barrier, mount_a_initial_barrier)
+
+        # Set a barrier on the MDS
+        self.fs.rank_asok(["osdmap", "barrier", new_epoch.__str__()])
+
+        # Sync up client with new MDS OSD map barrier
+        path = os.path.join(self.mount_a.mountpoint, "baz")
+        self.mount_a.run_python(pyscript.format(path=path))
+        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
+        self.assertEqual(mount_a_barrier, new_epoch)
+
+        # Some time passes here because the metadata part of the operation
+        # completes immediately, while the resulting OSD map update happens
+        # asynchronously (it's an Objecter::_maybe_request_map) as a result
+        # of seeing the new epoch barrier.
+        self.wait_until_true(
+            lambda: self.mount_a.get_osd_epoch()[0] >= new_epoch,
+            timeout=30)
+
+    def _data_pool_name(self):
+        data_pool_names = self.fs.get_data_pool_names()
+        if len(data_pool_names) > 1:
+            raise RuntimeError("This test can't handle multiple data pools")
+        else:
+            return data_pool_names[0]
+
+    def _test_full(self, easy_case):
+        """
+        - That a client trying to write data to a file is prevented
+        from doing so with an -EFULL result
+        - That they are also prevented from creating new files by the MDS.
+        - That they may delete another file to get the system healthy again
+
+        :param easy_case: if true, delete a successfully written file to
+                          free up space.  else, delete the file that experienced
+                          the failed write.
+        """
+
+        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
+
+        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))
+
+        # Fill up the cluster.  This dd may or may not fail, as it depends on
+        # how soon the cluster recognises its own fullness
+        self.mount_a.write_n_mb("large_file_a", self.fill_mb // 2)
+        try:
+            self.mount_a.write_n_mb("large_file_b", self.fill_mb // 2)
+        except CommandFailedError:
+            log.info("Writing file B failed (full status happened already)")
+            assert self.is_full()
+        else:
+            log.info("Writing file B succeeded (full status will happen soon)")
+            self.wait_until_true(lambda: self.is_full(),
+                                 timeout=osd_mon_report_interval * 5)
+
+        # Attempting to write more data should give me ENOSPC
+        with self.assertRaises(CommandFailedError) as ar:
+            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb // 2)
+        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"
+
+        # Wait for the MDS to see the latest OSD map so that it will reliably
+        # be applying the policy of rejecting non-deletion metadata operations
+        # while in the full state.
+        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
+        self.wait_until_true(
+            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
+            timeout=10)
+
+        if not self.data_only:
+            with self.assertRaises(CommandFailedError):
+                self.mount_a.write_n_mb("small_file_1", 0)
+
+        # Clear out some space
+        if easy_case:
+            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
+            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
+        else:
+            # In the hard case it is the file that filled the system.
+            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
+            # would fail because the last objects written would be
+            # stuck in the client cache as objecter operations.
+            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
+            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
+
+        # Here we are waiting for two things to happen:
+        # * The MDS to purge the stray folder and execute object deletions
+        #  * The OSDs to inform the mon that they are no longer full
+        self.wait_until_true(lambda: not self.is_full(),
+                             timeout=osd_mon_report_interval * 5)
+
+        # Wait for the MDS to see the latest OSD map so that it will reliably
+        # be applying the free space policy
+        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
+        self.wait_until_true(
+            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
+            timeout=10)
+
+        # Now I should be able to write again
+        self.mount_a.write_n_mb("large_file", 50, seek=0)
+
+        # Ensure that the MDS keeps its OSD epoch barrier across a restart
+
+    def test_full_different_file(self):
+        self._test_full(True)
+
+    def test_full_same_file(self):
+        self._test_full(False)
+
+    def _remote_write_test(self, template):
+        """
+        Run some remote python in a way that's useful for
+        testing free space behaviour (see test_* methods using this)
+        """
+        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")
+
+        # Enough to trip the full flag
+        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
+        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))
+
+        # Sufficient data to cause RADOS cluster to go 'full'
+        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))
+
+        # Long enough for RADOS cluster to notice it is full and set flag on mons
+        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
+        #  factor of 1.5 for I/O + network latency in committing OSD map and distributing it
+        #  to the OSDs)
+        full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5
+
+        # Configs for this test should bring this setting down in order to
+        # run reasonably quickly
+        if osd_mon_report_interval > 10:
+            log.warning("This test may run rather slowly unless you decrease"
+                     "osd_mon_report_interval (5 is a good setting)!")
+
+        self.mount_a.run_python(template.format(
+            fill_mb=self.fill_mb,
+            file_path=file_path,
+            full_wait=full_wait,
+            is_fuse=isinstance(self.mount_a, FuseMount)
+        ))
+
+    def test_full_fclose(self):
+        # A remote script which opens a file handle, fills up the filesystem, and then
+        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+        remote_script = dedent("""
+            import time
+            import datetime
+            import subprocess
+            import os
+
+            # Write some buffered data through before going full, all should be well
+            print("writing some data through which we expect to succeed")
+            bytes = 0
+            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+            bytes += os.write(f, b'a' * 512 * 1024)
+            os.fsync(f)
+            print("fsync'ed data successfully, will now attempt to fill fs")
+
+            # Okay, now we're going to fill up the filesystem, and then keep
+            # writing until we see an error from fsync.  As long as we're doing
+            # buffered IO, the error should always only appear from fsync and not
+            # from write
+            full = False
+
+            for n in range(0, int({fill_mb} * 0.9)):
+                bytes += os.write(f, b'x' * 1024 * 1024)
+                print("wrote {{0}} bytes via buffered write, may repeat".format(bytes))
+            print("done writing {{0}} bytes".format(bytes))
+
+            # OK, now we should sneak in under the full condition
+            # due to the time it takes the OSDs to report to the
+            # mons, and get a successful fsync on our full-making data
+            os.fsync(f)
+            print("successfully fsync'ed prior to getting full state reported")
+
+            # buffered write, add more dirty data to the buffer
+            print("starting buffered write")
+            try:
+                for n in range(0, int({fill_mb} * 0.2)):
+                    bytes += os.write(f, b'x' * 1024 * 1024)
+                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
+                    time.sleep({full_wait})
+            except OSError:
+                pass;
+
+            print("wrote, now waiting 30s and then doing a close we expect to fail")
+
+            # Wait long enough for a background flush that should fail
+            time.sleep(30)
+
+            if {is_fuse}:
+                # ...and check that the failed background flush is reflected in fclose
+                try:
+                    os.close(f)
+                except OSError:
+                    print("close() returned an error as expected")
+                else:
+                    raise RuntimeError("close() failed to raise error")
+            else:
+                # The kernel cephfs client does not raise errors on fclose
+                os.close(f)
+
+            os.unlink("{file_path}")
+            """)
+        self._remote_write_test(remote_script)
+
+    def test_full_fsync(self):
+        """
+        That when the full flag is encountered during asynchronous
+        flushes, such that an fwrite() succeeds but an fsync/fclose()
+        should return the ENOSPC error.
+        """
+
+        # A remote script which opens a file handle, fills up the filesystem, and then
+        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+        remote_script = dedent("""
+            import time
+            import datetime
+            import subprocess
+            import os
+
+            # Write some buffered data through before going full, all should be well
+            print("writing some data through which we expect to succeed")
+            bytes = 0
+            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+            bytes += os.write(f, b'a' * 4096)
+            os.fsync(f)
+            print("fsync'ed data successfully, will now attempt to fill fs")
+
+            # Okay, now we're going to fill up the filesystem, and then keep
+            # writing until we see an error from fsync.  As long as we're doing
+            # buffered IO, the error should always only appear from fsync and not
+            # from write
+            full = False
+
+            for n in range(0, int({fill_mb} * 1.1)):
+                try:
+                    bytes += os.write(f, b'x' * 1024 * 1024)
+                    print("wrote bytes via buffered write, moving on to fsync")
+                except OSError as e:
+                    print("Unexpected error %s from write() instead of fsync()" % e)
+                    raise
+
+                try:
+                    os.fsync(f)
+                    print("fsync'ed successfully")
+                except OSError as e:
+                    print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
+                    full = True
+                    break
+                else:
+                    print("Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0)))
+
+                if n > {fill_mb} * 0.9:
+                    # Be cautious in the last region where we expect to hit
+                    # the full condition, so that we don't overshoot too dramatically
+                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
+                    time.sleep({full_wait})
+
+            if not full:
+                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)
+
+            # close() should not raise an error because we already caught it in
+            # fsync.  There shouldn't have been any more writeback errors
+            # since then because all IOs got cancelled on the full flag.
+            print("calling close")
+            os.close(f)
+            print("close() did not raise error")
+
+            os.unlink("{file_path}")
+            """)
+
+        self._remote_write_test(remote_script)
+
+
+class TestQuotaFull(FullnessTestCase):
+    """
+    Test per-pool fullness, which indicates quota limits exceeded
+    """
+    pool_capacity = 1024 * 1024 * 32   # arbitrary low-ish limit
+    fill_mb = pool_capacity // (1024 * 1024)
+
+    # We are only testing quota handling on the data pool, not the metadata
+    # pool.
+    data_only = True
+
+    def setUp(self):
+        super(TestQuotaFull, self).setUp()
+
+        pool_name = self.fs.get_data_pool_name()
+        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
+                                            "max_bytes", "{0}".format(self.pool_capacity))
+
+    def is_full(self):
+        return self.fs.is_full()
+
+
+class TestClusterFull(FullnessTestCase):
+    """
+    Test data pool fullness, which indicates that an OSD has become too full
+    """
+    pool_capacity = None
+    REQUIRE_MEMSTORE = True
+
+    def setUp(self):
+        super(TestClusterFull, self).setUp()
+
+        if self.pool_capacity is None:
+            max_avail = self.fs.get_pool_df(self._data_pool_name())['max_avail']
+            full_ratio = float(self.fs.get_config("mon_osd_full_ratio", service_type="mon"))
+            TestClusterFull.pool_capacity = int(max_avail * full_ratio)
+            TestClusterFull.fill_mb = (self.pool_capacity // (1024 * 1024))
+
+    def is_full(self):
+        return self.fs.is_full()
+
+# Hide the parent class so that unittest.loader doesn't try to run it.
+del globals()['FullnessTestCase']
diff --git a/qa/tasks/cephfs/test_journal_migration.py b/qa/tasks/cephfs/test_journal_migration.py
new file mode 100644
index 00000000..8863b371
--- /dev/null
+++ b/qa/tasks/cephfs/test_journal_migration.py
@@ -0,0 +1,100 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.workunit import task as workunit
+
+JOURNAL_FORMAT_LEGACY = 0
+JOURNAL_FORMAT_RESILIENT = 1
+
+
+class TestJournalMigration(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 2
+
+    def test_journal_migration(self):
+        old_journal_version = JOURNAL_FORMAT_LEGACY
+        new_journal_version = JOURNAL_FORMAT_RESILIENT
+
+        self.mount_a.umount_wait()
+        self.fs.mds_stop()
+
+        # Create a filesystem using the older journal format.
+        self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
+        self.fs.mds_restart()
+        self.fs.recreate()
+
+        # Enable standby replay, to cover the bug case #8811 where
+        # a standby replay might mistakenly end up trying to rewrite
+        # the journal at the same time as an active daemon.
+        self.fs.set_allow_standby_replay(True)
+
+        status = self.fs.wait_for_daemons()
+
+        self.assertTrue(self.fs.get_replay(status=status) is not None)
+
+        # Do some client work so that the log is populated with something.
+        with self.mount_a.mounted():
+            self.mount_a.create_files()
+            self.mount_a.check_files()  # sanity, this should always pass
+
+            # Run a more substantial workunit so that the length of the log to be
+            # coverted is going span at least a few segments
+            workunit(self.ctx, {
+                'clients': {
+                    "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
+                },
+                "timeout": "3h"
+            })
+
+        # Modify the ceph.conf to ask the MDS to use the new journal format.
+        self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)
+
+        # Restart the MDS.
+        self.fs.mds_fail_restart()
+
+        # This ensures that all daemons come up into a valid state
+        status = self.fs.wait_for_daemons()
+
+        # Check that files created in the initial client workload are still visible
+        # in a client mount.
+        with self.mount_a.mounted():
+            self.mount_a.check_files()
+
+        # Verify that the journal really has been rewritten.
+        journal_version = self.fs.get_journal_version()
+        if journal_version != new_journal_version:
+            raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
+                new_journal_version, journal_version()
+            ))
+
+        # Verify that cephfs-journal-tool can now read the rewritten journal
+        inspect_out = self.fs.journal_tool(["journal", "inspect"], 0)
+        if not inspect_out.endswith(": OK"):
+            raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
+                inspect_out
+            ))
+
+        self.fs.journal_tool(["event", "get", "json",
+                              "--path", "/tmp/journal.json"], 0)
+        p = self.fs.tool_remote.sh([
+                "python3",
+                "-c",
+                "import json; print(len(json.load(open('/tmp/journal.json'))))"
+            ])
+        event_count = int(p.strip())
+        if event_count < 1000:
+            # Approximate value of "lots", expected from having run fsstress
+            raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))
+
+        # Do some client work to check that writing the log is still working
+        with self.mount_a.mounted():
+            workunit(self.ctx, {
+                'clients': {
+                    "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
+                },
+                "timeout": "3h"
+            })
+
+        # Check that both an active and a standby replay are still up
+        status = self.fs.status()
+        self.assertEqual(len(list(self.fs.get_replays(status=status))), 1)
+        self.assertEqual(len(list(self.fs.get_ranks(status=status))), 1)
diff --git a/qa/tasks/cephfs/test_journal_repair.py b/qa/tasks/cephfs/test_journal_repair.py
new file mode 100644
index 00000000..a52455d7
--- /dev/null
+++ b/qa/tasks/cephfs/test_journal_repair.py
@@ -0,0 +1,447 @@
+
+"""
+Test our tools for recovering the content of damaged journals
+"""
+
+import json
+import logging
+from textwrap import dedent
+import time
+
+from teuthology.exceptions import CommandFailedError, ConnectionLostError
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+from tasks.workunit import task as workunit
+
+log = logging.getLogger(__name__)
+
+
+class TestJournalRepair(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    def test_inject_to_empty(self):
+        """
+        That when some dentries in the journal but nothing is in
+        the backing store, we correctly populate the backing store
+        from the journalled dentries.
+        """
+
+        # Inject metadata operations
+        self.mount_a.run_shell(["touch", "rootfile"])
+        self.mount_a.run_shell(["mkdir", "subdir"])
+        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
+        # There are several different paths for handling hardlinks, depending
+        # on whether an existing dentry (being overwritten) is also a hardlink
+        self.mount_a.run_shell(["mkdir", "linkdir"])
+
+        # Test inode -> remote transition for a dentry
+        self.mount_a.run_shell(["touch", "linkdir/link0"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])
+
+        # Test nothing -> remote transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])
+
+        # Test remote -> inode transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
+        self.mount_a.run_shell(["touch", "linkdir/link2"])
+
+        # Test remote -> diff remote transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
+        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])
+
+        # Test an empty directory
+        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
+        self.mount_a.run_shell(["sync"])
+
+        # Before we unmount, make a note of the inode numbers, later we will
+        # check that they match what we recover from the journal
+        rootfile_ino = self.mount_a.path_to_ino("rootfile")
+        subdir_ino = self.mount_a.path_to_ino("subdir")
+        linkdir_ino = self.mount_a.path_to_ino("linkdir")
+        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
+        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")
+
+        self.mount_a.umount_wait()
+
+        # Stop the MDS
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Now, the journal should contain the operations, but the backing
+        # store shouldn't
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(subdir_ino)
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+        # Execute the dentry recovery, this should populate the backing store
+        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
+
+        # Dentries in ROOT_INO are present
+        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
+        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
+        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
+                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))
+
+        # Now check the MDS can read what we wrote: truncate the journal
+        # and start the mds.
+        self.fs.journal_tool(['journal', 'reset'], 0)
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # List files
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # First ls -R to populate MDCache, such that hardlinks will
+        # resolve properly (recover_dentries does not create backtraces,
+        # so ordinarily hardlinks to inodes that happen not to have backtraces
+        # will be invisible in readdir).
+        # FIXME: hook in forward scrub here to regenerate backtraces
+        proc = self.mount_a.run_shell(['ls', '-R'])
+        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        proc = self.mount_a.run_shell(['ls', '-R'])
+        self.assertEqual(proc.stdout.getvalue().strip(),
+                         dedent("""
+                         .:
+                         linkdir
+                         rootfile
+                         subdir
+
+                         ./linkdir:
+                         link0
+                         link1
+                         link2
+                         link3
+
+                         ./subdir:
+                         subdirfile
+                         subsubdir
+
+                         ./subdir/subsubdir:
+                         """).strip())
+
+        # Check the correct inos were preserved by path
+        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
+        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
+        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
+        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))
+
+        # Check that the hard link handling came out correctly
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
+        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)
+
+        # Create a new file, ensure it is not issued the same ino as one of the
+        # recovered ones
+        self.mount_a.run_shell(["touch", "afterwards"])
+        new_ino = self.mount_a.path_to_ino("afterwards")
+        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])
+
+        # Check that we can do metadata ops in the recovered directory
+        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])
+
+    @for_teuthology # 308s
+    def test_reset(self):
+        """
+        That after forcibly modifying the backing store, we can get back into
+        a good state by resetting the MDSMap.
+
+        The scenario is that we have two active MDSs, and we lose the journals.  Once
+        we have completely lost confidence in the integrity of the metadata, we want to
+        return the system to a single-MDS state to go into a scrub to recover what we
+        can.
+        """
+
+        # Set max_mds to 2
+        self.fs.set_max_mds(2)
+
+        # See that we have two active MDSs
+        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+        active_mds_names = self.fs.get_active_names()
+
+        # Switch off any unneeded MDS daemons
+        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
+            self.mds_cluster.mds_stop(unneeded_mds)
+            self.mds_cluster.mds_fail(unneeded_mds)
+
+        # Create a dir on each rank
+        self.mount_a.run_shell(["mkdir", "alpha"])
+        self.mount_a.run_shell(["mkdir", "bravo"])
+        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")
+
+        def subtrees_assigned():
+            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
+
+            for s in got_subtrees:
+                if s['dir']['path'] == '/bravo':
+                    if s['auth_first'] == 1:
+                        return True
+                    else:
+                        # Should not happen
+                        raise RuntimeError("/bravo is subtree but not rank 1!")
+
+            return False
+
+        # Ensure the pinning has taken effect and the /bravo dir is now
+        # migrated to rank 1.
+        self.wait_until_true(subtrees_assigned, 30)
+
+        # Do some IO (this should be split across ranks according to
+        # the rank-pinned dirs)
+        self.mount_a.create_n_files("alpha/file", 1000)
+        self.mount_a.create_n_files("bravo/file", 1000)
+
+        # Flush the journals so that we have some backing store data
+        # belonging to one MDS, and some to the other MDS.
+        for mds_name in active_mds_names:
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        # Stop (hard) the second MDS daemon
+        self.fs.mds_stop(active_mds_names[1])
+
+        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
+        # (this is the simulated failure that we will demonstrate that the disaster
+        #  recovery tools can get us back from)
+        self.fs.erase_metadata_objects(prefix="mds1_")
+
+        # Try to access files from the client
+        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)
+
+        # Check that this "ls -R" blocked rather than completing: indicates
+        # it got stuck trying to access subtrees which were on the now-dead MDS.
+        log.info("Sleeping to check ls is blocked...")
+        time.sleep(60)
+        self.assertFalse(blocked_ls.finished)
+
+        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
+        # is not coming back.  Kill it.
+        log.info("Killing mount, it's blocked on the MDS we killed")
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+        try:
+            # Now that the mount is dead, the ls -R should error out.
+            blocked_ls.wait()
+        except (CommandFailedError, ConnectionLostError):
+            # The ConnectionLostError case is for kernel client, where
+            # killing the mount also means killing the node.
+            pass
+
+        # See that the second MDS will crash when it starts and tries to
+        # acquire rank 1
+        damaged_id = active_mds_names[1]
+        self.fs.mds_restart(damaged_id)
+
+        # The daemon taking the damaged rank should start starting, then
+        # restart back into standby after asking the mon to mark the rank
+        # damaged.
+        def is_marked_damaged():
+            mds_map = self.fs.get_mds_map()
+            return 1 in mds_map['damaged']
+
+        self.wait_until_true(is_marked_damaged, 60)
+
+        def get_state():
+            info = self.mds_cluster.get_mds_info(damaged_id)
+            return info['state'] if info is not None else None
+
+        self.wait_until_equal(
+                get_state,
+                "up:standby",
+                timeout=60)
+
+        self.fs.mds_stop(damaged_id)
+        self.fs.mds_fail(damaged_id)
+
+        # Now give up and go through a disaster recovery procedure
+        self.fs.mds_stop(active_mds_names[0])
+        self.fs.mds_fail(active_mds_names[0])
+        # Invoke recover_dentries quietly, because otherwise log spews millions of lines
+        self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True)
+        self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True)
+        self.fs.table_tool(["0", "reset", "session"])
+        self.fs.journal_tool(["journal", "reset"], 0)
+        self.fs.erase_mds_objects(1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+                '--yes-i-really-mean-it')
+
+        # Bring an MDS back online, mount a client, and see that we can walk the full
+        # filesystem tree again
+        self.fs.mds_fail_restart(active_mds_names[0])
+        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
+                              reject_fn=lambda v: len(v) > 1)
+        self.mount_a.mount()
+        self.mount_a.run_shell(["ls", "-R"], wait=True)
+
+    def test_table_tool(self):
+        active_mdss = self.fs.get_active_names()
+        self.assertEqual(len(active_mdss), 1)
+        mds_name = active_mdss[0]
+
+        self.mount_a.run_shell(["touch", "foo"])
+        self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        log.info(self.fs.table_tool(["all", "show", "inode"]))
+        log.info(self.fs.table_tool(["all", "show", "snap"]))
+        log.info(self.fs.table_tool(["all", "show", "session"]))
+
+        # Inode table should always be the same because initial state
+        # and choice of inode are deterministic.
+        # Should see one inode consumed
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "inode"])),
+            {"0": {
+                "data": {
+                    "version": 2,
+                    "inotable": {
+                        "projected_free": [
+                            {"start": 1099511628777,
+                             "len": 1099511626775}],
+                        "free": [
+                            {"start": 1099511628777,
+                             "len": 1099511626775}]}},
+                "result": 0}}
+
+        )
+
+        # Should see one session
+        session_data = json.loads(self.fs.table_tool(
+            ["all", "show", "session"]))
+        self.assertEqual(len(session_data["0"]["data"]["sessions"]), 1)
+        self.assertEqual(session_data["0"]["result"], 0)
+
+        # Should see no snaps
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "snap"])),
+            {"version": 1,
+             "snapserver": {"last_snap": 1,
+                            "last_created": 1,
+                            "last_destroyed": 1,
+                            "pending_noop": [],
+                            "snaps": [],
+                            "need_to_purge": {},
+                            "pending_update": [],
+                            "pending_destroy": []},
+             "result": 0}
+        )
+
+        # Reset everything
+        for table in ["session", "inode", "snap"]:
+            self.fs.table_tool(["all", "reset", table])
+
+        log.info(self.fs.table_tool(["all", "show", "inode"]))
+        log.info(self.fs.table_tool(["all", "show", "snap"]))
+        log.info(self.fs.table_tool(["all", "show", "session"]))
+
+        # Should see 0 sessions
+        session_data = json.loads(self.fs.table_tool(
+            ["all", "show", "session"]))
+        self.assertEqual(len(session_data["0"]["data"]["sessions"]), 0)
+        self.assertEqual(session_data["0"]["result"], 0)
+
+        # Should see entire inode range now marked free
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "inode"])),
+            {"0": {"data": {"version": 1,
+                            "inotable": {"projected_free": [
+                                {"start": 1099511627776,
+                                 "len": 1099511627776}],
+                                 "free": [
+                                    {"start": 1099511627776,
+                                    "len": 1099511627776}]}},
+                   "result": 0}}
+        )
+
+        # Should see no snaps
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "snap"])),
+            {"version": 1,
+             "snapserver": {"last_snap": 1,
+                            "last_created": 1,
+                            "last_destroyed": 1,
+                            "pending_noop": [],
+                            "snaps": [],
+                            "need_to_purge": {},
+                            "pending_update": [],
+                            "pending_destroy": []},
+             "result": 0}
+        )
+
+    def test_table_tool_take_inos(self):
+        initial_range_start = 1099511627776
+        initial_range_len = 1099511627776
+        # Initially a completely clear range
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "inode"])),
+            {"0": {"data": {"version": 0,
+                            "inotable": {"projected_free": [
+                                {"start": initial_range_start,
+                                 "len": initial_range_len}],
+                                "free": [
+                                    {"start": initial_range_start,
+                                     "len": initial_range_len}]}},
+                   "result": 0}}
+        )
+
+        # Remove some
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
+            {"0": {"data": {"version": 1,
+                            "inotable": {"projected_free": [
+                                {"start": initial_range_start + 101,
+                                 "len": initial_range_len - 101}],
+                                "free": [
+                                    {"start": initial_range_start + 101,
+                                     "len": initial_range_len - 101}]}},
+                   "result": 0}}
+        )
+
+    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
+    def test_journal_smoke(self):
+        workunit(self.ctx, {
+            'clients': {
+                "client.{0}".format(self.mount_a.client_id): [
+                    "fs/misc/trivial_sync.sh"],
+            },
+            "timeout": "1h"
+        })
+
+        for mount in self.mounts:
+            mount.umount_wait()
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # journal tool smoke
+        workunit(self.ctx, {
+            'clients': {
+                "client.{0}".format(self.mount_a.client_id): [
+                    "suites/cephfs_journal_tool_smoke.sh"],
+            },
+            "timeout": "1h"
+        })
+
+
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+
+        # trivial sync moutn a
+        workunit(self.ctx, {
+            'clients': {
+                "client.{0}".format(self.mount_a.client_id): [
+                    "fs/misc/trivial_sync.sh"],
+            },
+            "timeout": "1h"
+        })
+
diff --git a/qa/tasks/cephfs/test_mantle.py b/qa/tasks/cephfs/test_mantle.py
new file mode 100644
index 00000000..6cd86ad1
--- /dev/null
+++ b/qa/tasks/cephfs/test_mantle.py
@@ -0,0 +1,109 @@
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import json
+import logging
+
+log = logging.getLogger(__name__)
+failure = "using old balancer; mantle failed for balancer="
+success = "mantle balancer version changed: "
+
+class TestMantle(CephFSTestCase):
+    def start_mantle(self):
+        self.wait_for_health_clear(timeout=30)
+        self.fs.set_max_mds(2)
+        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        for m in self.fs.get_active_names():
+            self.fs.mds_asok(['config', 'set', 'debug_objecter', '20'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_ms', '0'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_mds', '0'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_mds_balancer', '5'], mds_id=m)
+
+    def push_balancer(self, obj, lua_code, expect):
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj)
+        self.fs.rados(["put", obj, "-"], stdin_data=lua_code)
+        with self.assert_cluster_log(failure + obj + " " + expect):
+            log.info("run a " + obj + " balancer that expects=" + expect)
+
+    def test_version_empty(self):
+        self.start_mantle()
+        expect = " : (2) No such file or directory"
+
+        ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer')
+        assert(ret == 22) # EINVAL
+
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ")
+        with self.assert_cluster_log(failure + " " + expect): pass
+
+    def test_version_not_in_rados(self):
+        self.start_mantle()
+        expect = failure + "ghost.lua : (2) No such file or directory"
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua")
+        with self.assert_cluster_log(expect): pass
+
+    def test_balancer_invalid(self):
+        self.start_mantle()
+        expect = ": (22) Invalid argument"
+
+        lua_code = "this is invalid lua code!"
+        self.push_balancer("invalid.lua", lua_code, expect)
+
+        lua_code = "BAL_LOG()"
+        self.push_balancer("invalid_log.lua", lua_code, expect)
+
+        lua_code = "BAL_LOG(0)"
+        self.push_balancer("invalid_log_again.lua", lua_code, expect)
+
+    def test_balancer_valid(self):
+        self.start_mantle()
+        lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}"
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
+        self.fs.rados(["put", "valid.lua", "-"], stdin_data=lua_code)
+        with self.assert_cluster_log(success + "valid.lua"):
+            log.info("run a valid.lua balancer")
+
+    def test_return_invalid(self):
+        self.start_mantle()
+        expect = ": (22) Invalid argument"
+
+        lua_code = "return \"hello\""
+        self.push_balancer("string.lua", lua_code, expect)
+
+        lua_code = "return 3"
+        self.push_balancer("number.lua", lua_code, expect)
+
+        lua_code = "return {}"
+        self.push_balancer("dict_empty.lua", lua_code, expect)
+
+        lua_code = "return {\"this\", \"is\", \"a\", \"test\"}"
+        self.push_balancer("dict_of_strings.lua", lua_code, expect)
+
+        lua_code = "return {3, \"test\"}"
+        self.push_balancer("dict_of_mixed.lua", lua_code, expect)
+
+        lua_code = "return {3}"
+        self.push_balancer("not_enough_numbers.lua", lua_code, expect)
+
+        lua_code = "return {3, 4, 5, 6, 7, 8, 9}"
+        self.push_balancer("too_many_numbers.lua", lua_code, expect)
+
+    def test_dead_osd(self):
+        self.start_mantle()
+        expect = " : (110) Connection timed out"
+
+        # kill the OSDs so that the balancer pull from RADOS times out
+        osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
+        for i in range(0, len(osd_map['osds'])):
+          self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i))
+          self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i))
+
+        # trigger a pull from RADOS
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
+
+        # make the timeout a little longer since dead OSDs spam ceph -w
+        with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30):
+            log.info("run a balancer that should timeout")
+
+        # cleanup
+        for i in range(0, len(osd_map['osds'])):
+          self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i))
diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py
new file mode 100644
index 00000000..cd72ac38
--- /dev/null
+++ b/qa/tasks/cephfs/test_misc.py
@@ -0,0 +1,291 @@
+
+from unittest import SkipTest
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+import errno
+import time
+import json
+import logging
+
+log = logging.getLogger(__name__)
+
+class TestMisc(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+
+    def test_getattr_caps(self):
+        """
+        Check if MDS recognizes the 'mask' parameter of open request.
+        The parameter allows client to request caps when opening file
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client")
+
+        # Enable debug. Client will requests CEPH_CAP_XATTR_SHARED
+        # on lookup/open
+        self.mount_b.umount_wait()
+        self.set_conf('client', 'client debug getattr caps', 'true')
+        self.mount_b.mount()
+        self.mount_b.wait_until_mounted()
+
+        # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_*
+        # to mount_a
+        p = self.mount_a.open_background("testfile")
+        self.mount_b.wait_for_visible("testfile")
+
+        # this triggers a lookup request and an open request. The debug
+        # code will check if lookup/open reply contains xattrs
+        self.mount_b.run_shell(["cat", "testfile"])
+
+        self.mount_a.kill_background(p)
+
+    def test_root_rctime(self):
+        """
+        Check that the root inode has a non-default rctime on startup.
+        """
+
+        t = time.time()
+        rctime = self.mount_a.getfattr(".", "ceph.dir.rctime")
+        log.info("rctime = {}".format(rctime))
+        self.assertGreaterEqual(float(rctime), t - 10)
+
+    def test_fs_new(self):
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        data_pool_name = self.fs.get_data_pool_name()
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
+                                            '--yes-i-really-mean-it')
+
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                            self.fs.metadata_pool_name,
+                                            self.fs.metadata_pool_name,
+                                            '--yes-i-really-really-mean-it')
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                            self.fs.metadata_pool_name,
+                                            self.fs.pgs_per_fs_pool.__str__())
+
+        dummyfile = '/etc/fstab'
+
+        self.fs.put_metadata_object_raw("key", dummyfile)
+
+        def get_pool_df(fs, name):
+            try:
+                return fs.get_pool_df(name)['objects'] > 0
+            except RuntimeError:
+                return False
+
+        self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30)
+
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                                self.fs.metadata_pool_name,
+                                                data_pool_name)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.EINVAL)
+        else:
+            raise AssertionError("Expected EINVAL")
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                            self.fs.metadata_pool_name,
+                                            data_pool_name, "--force")
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
+                                            '--yes-i-really-mean-it')
+
+
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                            self.fs.metadata_pool_name,
+                                            self.fs.metadata_pool_name,
+                                            '--yes-i-really-really-mean-it')
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                            self.fs.metadata_pool_name,
+                                            self.fs.pgs_per_fs_pool.__str__())
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                            self.fs.metadata_pool_name,
+                                            data_pool_name)
+
+    def test_cap_revoke_nonresponder(self):
+        """
+        Check that a client is evicted if it has not responded to cap revoke
+        request for configured number of seconds.
+        """
+        session_timeout = self.fs.get_var("session_timeout")
+        eviction_timeout = session_timeout / 2.0
+
+        self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout',
+                          str(eviction_timeout)])
+
+        cap_holder = self.mount_a.open_background()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.kill()
+
+        try:
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+
+            a = time.time()
+            time.sleep(eviction_timeout)
+            cap_waiter.wait()
+            b = time.time()
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+
+            # check if the cap is transferred before session timeout kicked in.
+            # this is a good enough check to ensure that the client got evicted
+            # by the cap auto evicter rather than transitioning to stale state
+            # and then getting evicted.
+            self.assertLess(cap_waited, session_timeout,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, session_timeout
+                            ))
+
+            self.assertTrue(self.mount_a.is_blacklisted())
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except (CommandFailedError, ConnectionLostError):
+                # We killed it (and possibly its node), so it raises an error
+                pass
+        finally:
+            self.mount_a.kill_cleanup()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_filtered_df(self):
+        pool_name = self.fs.get_data_pool_name()
+        raw_df = self.fs.get_pool_df(pool_name)
+        raw_avail = float(raw_df["max_avail"])
+        out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get',
+                                                  pool_name, 'size',
+                                                  '-f', 'json-pretty')
+        _ = json.loads(out)
+
+        proc = self.mount_a.run_shell(['df', '.'])
+        output = proc.stdout.getvalue()
+        fs_avail = output.split('\n')[1].split()[3]
+        fs_avail = float(fs_avail) * 1024
+
+        ratio = raw_avail / fs_avail
+        assert 0.9 < ratio < 1.1
+
+    def test_dump_inode(self):
+        info = self.fs.mds_asok(['dump', 'inode', '1'])
+        assert(info['path'] == "/")
+
+    def test_dump_inode_hexademical(self):
+        self.mount_a.run_shell(["mkdir", "-p", "foo"])
+        ino = self.mount_a.path_to_ino("foo")
+        assert type(ino) is int
+        info = self.fs.mds_asok(['dump', 'inode', hex(ino)])
+        assert info['path'] == "/foo"
+
+
+class TestCacheDrop(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+
+    def _run_drop_cache_cmd(self, timeout=None):
+        result = None
+        mds_id = self.fs.get_lone_mds_id()
+        if timeout is not None:
+            result = self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id),
+                                                    "cache", "drop", str(timeout))
+        else:
+            result = self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id),
+                                                    "cache", "drop")
+        return json.loads(result)
+
+    def _setup(self, max_caps=20, threshold=400):
+        # create some files
+        self.mount_a.create_n_files("dc-dir/dc-file", 1000, sync=True)
+
+        # Reduce this so the MDS doesn't rkcall the maximum for simple tests
+        self.fs.rank_asok(['config', 'set', 'mds_recall_max_caps', str(max_caps)])
+        self.fs.rank_asok(['config', 'set', 'mds_recall_max_decay_threshold', str(threshold)])
+
+    def test_drop_cache_command(self):
+        """
+        Basic test for checking drop cache command.
+        Confirm it halts without a timeout.
+        Note that the cache size post trimming is not checked here.
+        """
+        mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+        self._setup()
+        result = self._run_drop_cache_cmd()
+        self.assertEqual(result['client_recall']['return_code'], 0)
+        self.assertEqual(result['flush_journal']['return_code'], 0)
+        # It should take at least 1 second
+        self.assertGreater(result['duration'], 1)
+        self.assertGreaterEqual(result['trim_cache']['trimmed'], 1000-2*mds_min_caps_per_client)
+
+    def test_drop_cache_command_timeout(self):
+        """
+        Basic test for checking drop cache command.
+        Confirm recall halts early via a timeout.
+        Note that the cache size post trimming is not checked here.
+        """
+        self._setup()
+        result = self._run_drop_cache_cmd(timeout=10)
+        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
+        self.assertEqual(result['flush_journal']['return_code'], 0)
+        self.assertGreater(result['duration'], 10)
+        self.assertGreaterEqual(result['trim_cache']['trimmed'], 100) # we did something, right?
+
+    def test_drop_cache_command_dead_timeout(self):
+        """
+        Check drop cache command with non-responding client using tell
+        interface. Note that the cache size post trimming is not checked
+        here.
+        """
+        self._setup()
+        self.mount_a.kill()
+        # Note: recall is subject to the timeout. The journal flush will
+        # be delayed due to the client being dead.
+        result = self._run_drop_cache_cmd(timeout=5)
+        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
+        self.assertEqual(result['flush_journal']['return_code'], 0)
+        self.assertGreater(result['duration'], 5)
+        self.assertLess(result['duration'], 120)
+        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
+        # cache now causes the Locker to drive eviction of stale clients (a
+        # stale session will be autoclosed at mdsmap['session_timeout']). The
+        # particular operation causing this is journal flush which causes the
+        # MDS to wait wait for cap revoke.
+        #self.assertEqual(0, result['trim_cache']['trimmed'])
+        self.mount_a.kill_cleanup()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_drop_cache_command_dead(self):
+        """
+        Check drop cache command with non-responding client using tell
+        interface. Note that the cache size post trimming is not checked
+        here.
+        """
+        self._setup()
+        self.mount_a.kill()
+        result = self._run_drop_cache_cmd()
+        self.assertEqual(result['client_recall']['return_code'], 0)
+        self.assertEqual(result['flush_journal']['return_code'], 0)
+        self.assertGreater(result['duration'], 5)
+        self.assertLess(result['duration'], 120)
+        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
+        # cache now causes the Locker to drive eviction of stale clients (a
+        # stale session will be autoclosed at mdsmap['session_timeout']). The
+        # particular operation causing this is journal flush which causes the
+        # MDS to wait wait for cap revoke.
+        self.mount_a.kill_cleanup()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
diff --git a/qa/tasks/cephfs/test_openfiletable.py b/qa/tasks/cephfs/test_openfiletable.py
new file mode 100644
index 00000000..36e212d7
--- /dev/null
+++ b/qa/tasks/cephfs/test_openfiletable.py
@@ -0,0 +1,41 @@
+import time
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+class OpenFileTable(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def test_max_items_per_obj(self):
+        """
+        The maximum number of openfiles omap objects keys are now equal to
+        osd_deep_scrub_large_omap_object_key_threshold option.
+        """
+        self.set_conf("mds", "osd_deep_scrub_large_omap_object_key_threshold", "5")
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        # Write some bytes to a file
+        size_mb = 1
+
+        # Hold the file open
+        file_count = 8
+        for i in range(0, file_count):
+            filename = "open_file{}".format(i)
+            p = self.mount_a.open_background(filename)
+            self.mount_a.write_n_mb(filename, size_mb)
+
+        time.sleep(10)
+
+        """
+        With osd_deep_scrub_large_omap_object_key_threshold value as 5 and
+        opening 8 files we should have a new rados object with name
+        mds0_openfiles.1 to hold the extra keys.
+        """
+
+        stat_out = self.fs.rados(["stat", "mds0_openfiles.1"])
+
+        # Now close the file
+        self.mount_a.kill_background(p)
diff --git a/qa/tasks/cephfs/test_pool_perm.py b/qa/tasks/cephfs/test_pool_perm.py
new file mode 100644
index 00000000..a1f234a2
--- /dev/null
+++ b/qa/tasks/cephfs/test_pool_perm.py
@@ -0,0 +1,113 @@
+from textwrap import dedent
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import os
+
+
+class TestPoolPerm(CephFSTestCase):
+    def test_pool_perm(self):
+        self.mount_a.run_shell(["touch", "test_file"])
+
+        file_path = os.path.join(self.mount_a.mountpoint, "test_file")
+
+        remote_script = dedent("""
+            import os
+            import errno
+
+            fd = os.open("{path}", os.O_RDWR)
+            try:
+                if {check_read}:
+                    ret = os.read(fd, 1024)
+                else:
+                    os.write(fd, b'content')
+            except OSError as e:
+                if e.errno != errno.EPERM:
+                    raise
+            else:
+                raise RuntimeError("client does not check permission of data pool")
+            """)
+
+        client_name = "client.{0}".format(self.mount_a.client_id)
+
+        # set data pool read only
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow r pool={0}'.format(self.fs.get_data_pool_name()))
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # write should fail
+        self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(False)))
+
+        # set data pool write only
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow w pool={0}'.format(self.fs.get_data_pool_name()))
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # read should fail
+        self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(True)))
+
+    def test_forbidden_modification(self):
+        """
+        That a client who does not have the capability for setting
+        layout pools is prevented from doing so.
+        """
+
+        # Set up
+        client_name = "client.{0}".format(self.mount_a.client_id)
+        new_pool_name = "data_new"
+        self.fs.add_data_pool(new_pool_name)
+
+        self.mount_a.run_shell(["touch", "layoutfile"])
+        self.mount_a.run_shell(["mkdir", "layoutdir"])
+
+        # Set MDS 'rw' perms: missing 'p' means no setting pool layouts
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow rw', 'mon', 'allow r',
+            'osd',
+            'allow rw pool={0},allow rw pool={1}'.format(
+                self.fs.get_data_pool_names()[0],
+                self.fs.get_data_pool_names()[1],
+            ))
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool",
+                                  new_pool_name)
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool",
+                                  new_pool_name)
+        self.mount_a.umount_wait()
+
+        # Set MDS 'rwp' perms: should now be able to set layouts
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow rwp', 'mon', 'allow r',
+            'osd',
+            'allow rw pool={0},allow rw pool={1}'.format(
+                self.fs.get_data_pool_names()[0],
+                self.fs.get_data_pool_names()[1],
+            ))
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool",
+                              new_pool_name)
+        self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool",
+                              new_pool_name)
+        self.mount_a.umount_wait()
+
+    def tearDown(self):
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_a.client_id),
+            'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow rw pool={0}'.format(self.fs.get_data_pool_names()[0]))
+        super(TestPoolPerm, self).tearDown()
+
diff --git a/qa/tasks/cephfs/test_quota.py b/qa/tasks/cephfs/test_quota.py
new file mode 100644
index 00000000..dcfda5e2
--- /dev/null
+++ b/qa/tasks/cephfs/test_quota.py
@@ -0,0 +1,106 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+from teuthology.exceptions import CommandFailedError
+
+class TestQuota(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 1
+
+    def test_remote_update_getfattr(self):
+        """
+        That quota changes made from one client are visible to another
+        client looking at ceph.quota xattrs
+        """
+        self.mount_a.run_shell(["mkdir", "subdir"])
+
+        self.assertEqual(
+            self.mount_a.getfattr("./subdir", "ceph.quota.max_files"),
+            None)
+        self.assertEqual(
+            self.mount_b.getfattr("./subdir", "ceph.quota.max_files"),
+            None)
+
+        self.mount_a.setfattr("./subdir", "ceph.quota.max_files", "10")
+        self.assertEqual(
+            self.mount_a.getfattr("./subdir", "ceph.quota.max_files"),
+            "10")
+
+        # Should be visible as soon as setxattr operation completes on
+        # mds (we get here sooner because setfattr gets an early reply)
+        self.wait_until_equal(
+            lambda: self.mount_b.getfattr("./subdir", "ceph.quota.max_files"),
+            "10", timeout=10)
+
+    def test_remote_update_df(self):
+        """
+        That when a client modifies the quota on a directory used
+        as another client's root, the other client sees the change
+        reflected in their statfs output.
+        """
+
+        self.mount_b.umount_wait()
+
+        self.mount_a.run_shell(["mkdir", "subdir"])
+
+        size_before = 1024 * 1024 * 128
+        self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes",
+                              "%s" % size_before)
+
+        self.mount_b.mount(mount_path="/subdir")
+
+        self.assertDictEqual(
+            self.mount_b.df(),
+            {
+                "total": size_before,
+                "used": 0,
+                "available": size_before
+            })
+
+        size_after = 1024 * 1024 * 256
+        self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes",
+                              "%s" % size_after)
+
+        # Should be visible as soon as setxattr operation completes on
+        # mds (we get here sooner because setfattr gets an early reply)
+        self.wait_until_equal(
+            lambda: self.mount_b.df(),
+            {
+                "total": size_after,
+                "used": 0,
+                "available": size_after
+            },
+            timeout=10
+        )
+
+    def test_remote_update_write(self):
+        """
+        That when a client modifies the quota on a directory used
+        as another client's root, the other client sees the effect
+        of the change when writing data.
+        """
+
+        self.mount_a.run_shell(["mkdir", "subdir_files"])
+        self.mount_a.run_shell(["mkdir", "subdir_data"])
+
+        # Set some nice high quotas that mount_b's initial operations
+        # will be well within
+        self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "100")
+        self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "104857600")
+
+        # Do some writes within my quota
+        self.mount_b.create_n_files("subdir_files/file", 20)
+        self.mount_b.write_n_mb("subdir_data/file", 20)
+
+        # Set quotas lower than what mount_b already wrote, it should
+        # refuse to write more once it's seen them
+        self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "10")
+        self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "1048576")
+
+        # Do some writes that would have been okay within the old quota,
+        # but are forbidden under the new quota
+        with self.assertRaises(CommandFailedError):
+            self.mount_b.create_n_files("subdir_files/file", 40)
+        with self.assertRaises(CommandFailedError):
+            self.mount_b.write_n_mb("subdir_data/file", 40)
+
diff --git a/qa/tasks/cephfs/test_readahead.py b/qa/tasks/cephfs/test_readahead.py
new file mode 100644
index 00000000..31e7bf18
--- /dev/null
+++ b/qa/tasks/cephfs/test_readahead.py
@@ -0,0 +1,31 @@
+import logging
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestReadahead(CephFSTestCase):
+    def test_flush(self):
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("FUSE needed for measuring op counts")
+
+        # Create 32MB file
+        self.mount_a.run_shell(["dd", "if=/dev/urandom", "of=foo", "bs=1M", "count=32"])
+
+        # Unmount and remount the client to flush cache
+        self.mount_a.umount_wait()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        initial_op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r']
+        self.mount_a.run_shell(["dd", "if=foo", "of=/dev/null", "bs=128k", "count=32"])
+        op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r']
+        assert op_r >= initial_op_r
+        op_r -= initial_op_r
+        log.info("read operations: {0}".format(op_r))
+
+        # with exponentially increasing readahead, we should see fewer than 10 operations
+        # but this test simply checks if the client is doing a remote read for each local read
+        if op_r >= 32:
+            raise RuntimeError("readahead not working")
diff --git a/qa/tasks/cephfs/test_recovery_pool.py b/qa/tasks/cephfs/test_recovery_pool.py
new file mode 100644
index 00000000..36b4e58e
--- /dev/null
+++ b/qa/tasks/cephfs/test_recovery_pool.py
@@ -0,0 +1,207 @@
+"""
+Test our tools for recovering metadata from the data pool into an alternate pool
+"""
+
+import logging
+import traceback
+from collections import namedtuple
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class OverlayWorkload(object):
+    def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
+        self._orig_fs = orig_fs
+        self._recovery_fs = recovery_fs
+        self._orig_mount = orig_mount
+        self._recovery_mount = recovery_mount
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        objects = self._orig_fs.rados(["ls"]).split("\n")
+        for o in objects:
+            self._orig_fs.rados(["rm", o])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._orig_fs.mds_asok(["flush", "journal"])
+        self._recovery_fs.mds_asok(["flush", "journal"])
+
+
+class SimpleOverlayWorkload(OverlayWorkload):
+    """
+    Single file, single directory, check that it gets recovered and so does its size
+    """
+    def write(self):
+        self._orig_mount.run_shell(["mkdir", "subdir"])
+        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._orig_mount.stat("subdir/sixmegs")
+
+    def validate(self):
+        self._recovery_mount.run_shell(["ls", "subdir"])
+        st = self._recovery_mount.stat("subdir/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+class TestRecoveryPool(CephFSTestCase):
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
+    REQUIRE_RECOVERY_FILESYSTEM = True
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+        workload.flush()
+
+        # Create the alternate pool if requested
+        recovery_fs = self.recovery_fs.name
+        recovery_pool = self.recovery_fs.get_metadata_pool_name()
+        self.recovery_fs.data_scan(['init', '--force-init',
+                                    '--filesystem', recovery_fs,
+                                    '--alternate-pool', recovery_pool])
+        self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
+
+        # Stop the MDS
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+                '--yes-i-really-mean-it')
+
+        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure
+        if False:
+            with self.assertRaises(CommandFailedError):
+                # Normal reset should fail when no objects are present, we'll use --force instead
+                self.fs.journal_tool(["journal", "reset"], 0)
+
+        self.fs.mds_stop()
+        self.fs.data_scan(['scan_extents', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           self.fs.get_data_pool_name()])
+        self.fs.data_scan(['scan_inodes', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list',
+                              '--alternate-pool', recovery_pool], 0)
+
+        self.fs.data_scan(['init', '--force-init', '--filesystem',
+                           self.fs.name])
+        self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
+
+        self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
+        self.fs.journal_tool(['journal', 'reset', '--force'], 0)
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
+                                            recovery_fs + ":0")
+
+        # Mark the MDS repaired
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+        # Start the MDS
+        self.fs.mds_restart()
+        self.recovery_fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.recovery_fs.wait_for_daemons()
+        status = self.recovery_fs.status()
+        for rank in self.recovery_fs.get_ranks(status=status):
+            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
+                                                'injectargs', '--debug-mds=20')
+            self.fs.rank_tell(['scrub', 'start', '/', 'recursive', 'repair'], rank=rank['rank'], status=status)
+        log.info(str(self.mds_cluster.status()))
+
+        # Mount a client
+        self.mount_a.mount()
+        self.mount_b.mount(mount_fs_name=recovery_fs)
+        self.mount_a.wait_until_mounted()
+        self.mount_b.wait_until_mounted()
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
+                                                     self.mount_a, self.mount_b))
diff --git a/qa/tasks/cephfs/test_scrub.py b/qa/tasks/cephfs/test_scrub.py
new file mode 100644
index 00000000..1875b5f3
--- /dev/null
+++ b/qa/tasks/cephfs/test_scrub.py
@@ -0,0 +1,175 @@
+"""
+Test CephFS scrub (distinct from OSD scrub) functionality
+"""
+import logging
+from collections import namedtuple
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class Workload(CephFSTestCase):
+    def __init__(self, filesystem, mount):
+        super().__init__()
+        self._mount = mount
+        self._filesystem = filesystem
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        objects = self._filesystem.rados(["ls"]).split("\n")
+        for o in objects:
+            self._filesystem.rados(["rm", o])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._filesystem.mds_asok(["flush", "journal"])
+
+
+class BacktraceWorkload(Workload):
+    """
+    Single file, single directory, wipe the backtrace and check it.
+    """
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+
+    def validate(self):
+        st = self._mount.stat("subdir/sixmegs")
+        self._filesystem.mds_asok(["flush", "journal"])
+        bt = self._filesystem.read_backtrace(st['st_ino'])
+        parent = bt['ancestors'][0]['dname']
+        self.assertEqual(parent, 'sixmegs')
+        return self._errors
+
+    def damage(self):
+        st = self._mount.stat("subdir/sixmegs")
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._filesystem._write_data_xattr(st['st_ino'], "parent", "")
+
+    def create_files(self, nfiles=1000):
+        self._mount.create_n_files("scrub-new-files/file", nfiles)
+
+
+class DupInodeWorkload(Workload):
+    """
+    Duplicate an inode and try scrubbing it twice."
+    """
+
+    def write(self):
+        self._mount.run_shell(["mkdir", "parent"])
+        self._mount.run_shell(["mkdir", "parent/child"])
+        self._mount.write_n_mb("parent/parentfile", 6)
+        self._mount.write_n_mb("parent/child/childfile", 6)
+
+    def damage(self):
+        temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
+        self._mount.umount_wait()
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._filesystem.mds_stop()
+        self._filesystem.rados(["getomapval", "10000000000.00000000",
+                                "parentfile_head", temp_bin_path])
+        self._filesystem.rados(["setomapval", "10000000000.00000000",
+                                "shadow_head"], stdin_file=temp_bin_path)
+        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
+        self._filesystem.mds_restart()
+        self._filesystem.wait_for_daemons()
+
+    def validate(self):
+        out_json = self._filesystem.rank_tell(["scrub", "start", "/", "recursive", "repair"])
+        self.assertNotEqual(out_json, None)
+        self.assertTrue(self._filesystem.are_daemons_healthy())
+        return self._errors
+
+
+class TestScrub(CephFSTestCase):
+    MDSS_REQUIRED = 1
+
+    def setUp(self):
+        super().setUp()
+
+    def _scrub(self, workload, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        out_json = self.fs.rank_tell(["scrub", "start", "/", "recursive", "repair"])
+        self.assertNotEqual(out_json, None)
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def _get_damage_count(self, damage_type='backtrace'):
+        out_json = self.fs.rank_tell(["damage", "ls"])
+        self.assertNotEqual(out_json, None)
+
+        damage_count = 0
+        for it in out_json:
+            if it['damage_type'] == damage_type:
+                damage_count += 1
+        return damage_count
+
+    def _scrub_new_files(self, workload):
+        """
+        That scrubbing new files does not lead to errors
+        """
+        workload.create_files(1000)
+        self._wait_until_scrub_complete()
+        self.assertEqual(self._get_damage_count(), 0)
+
+    def test_scrub_backtrace_for_new_files(self):
+        self._scrub_new_files(BacktraceWorkload(self.fs, self.mount_a))
+
+    def test_scrub_backtrace(self):
+        self._scrub(BacktraceWorkload(self.fs, self.mount_a))
+
+    def test_scrub_dup_inode(self):
+        self._scrub(DupInodeWorkload(self.fs, self.mount_a))
diff --git a/qa/tasks/cephfs/test_scrub_checks.py b/qa/tasks/cephfs/test_scrub_checks.py
new file mode 100644
index 00000000..54ed16ff
--- /dev/null
+++ b/qa/tasks/cephfs/test_scrub_checks.py
@@ -0,0 +1,405 @@
+"""
+MDS admin socket scrubbing-related tests.
+"""
+import json
+import logging
+import errno
+import time
+from teuthology.exceptions import CommandFailedError
+import os
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class TestScrubControls(CephFSTestCase):
+    """
+    Test basic scrub control operations such as abort, pause and resume.
+    """
+
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 1
+
+    def _abort_scrub(self, expected):
+        res = self.fs.rank_tell(["scrub", "abort"])
+        self.assertEqual(res['return_code'], expected)
+    def _pause_scrub(self, expected):
+        res = self.fs.rank_tell(["scrub", "pause"])
+        self.assertEqual(res['return_code'], expected)
+    def _resume_scrub(self, expected):
+        res = self.fs.rank_tell(["scrub", "resume"])
+        self.assertEqual(res['return_code'], expected)
+    def _get_scrub_status(self):
+        return self.fs.rank_tell(["scrub", "status"])
+    def _check_task_status(self, expected_status):
+        task_status = self.fs.get_task_status("scrub status")
+        active = self.fs.get_active_names()
+        log.debug("current active={0}".format(active))
+        self.assertTrue(task_status[active[0]].startswith(expected_status))
+
+    def test_scrub_abort(self):
+        test_dir = "scrub_control_test_path"
+        abs_test_path = "/{0}".format(test_dir)
+
+        log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
+        client_path = os.path.join(self.mount_a.mountpoint, test_dir)
+        log.info("client_path: {0}".format(client_path))
+
+        log.info("Cloning repo into place")
+        TestScrubChecks.clone_repo(self.mount_a, client_path)
+
+        out_json = self.fs.rank_tell(["scrub", "start", abs_test_path, "recursive"])
+        self.assertNotEqual(out_json, None)
+
+        # abort and verify
+        self._abort_scrub(0)
+        out_json = self._get_scrub_status()
+        self.assertTrue("no active" in out_json['status'])
+
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+        self._check_task_status("idle")
+
+    def test_scrub_pause_and_resume(self):
+        test_dir = "scrub_control_test_path"
+        abs_test_path = "/{0}".format(test_dir)
+
+        log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
+        client_path = os.path.join(self.mount_a.mountpoint, test_dir)
+        log.info("client_path: {0}".format(client_path))
+
+        log.info("Cloning repo into place")
+        _ = TestScrubChecks.clone_repo(self.mount_a, client_path)
+
+        out_json = self.fs.rank_tell(["scrub", "start", abs_test_path, "recursive"])
+        self.assertNotEqual(out_json, None)
+
+        # pause and verify
+        self._pause_scrub(0)
+        out_json = self._get_scrub_status()
+        self.assertTrue("PAUSED" in out_json['status'])
+
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+        self._check_task_status("paused")
+
+        # resume and verify
+        self._resume_scrub(0)
+        out_json = self._get_scrub_status()
+        self.assertFalse("PAUSED" in out_json['status'])
+
+    def test_scrub_pause_and_resume_with_abort(self):
+        test_dir = "scrub_control_test_path"
+        abs_test_path = "/{0}".format(test_dir)
+
+        log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
+        client_path = os.path.join(self.mount_a.mountpoint, test_dir)
+        log.info("client_path: {0}".format(client_path))
+
+        log.info("Cloning repo into place")
+        _ = TestScrubChecks.clone_repo(self.mount_a, client_path)
+
+        out_json = self.fs.rank_tell(["scrub", "start", abs_test_path, "recursive"])
+        self.assertNotEqual(out_json, None)
+
+        # pause and verify
+        self._pause_scrub(0)
+        out_json = self._get_scrub_status()
+        self.assertTrue("PAUSED" in out_json['status'])
+
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+        self._check_task_status("paused")
+
+        # abort and verify
+        self._abort_scrub(0)
+        out_json = self._get_scrub_status()
+        self.assertTrue("PAUSED" in out_json['status'])
+        self.assertTrue("0 inodes" in out_json['status'])
+
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+        self._check_task_status("paused")
+
+        # resume and verify
+        self._resume_scrub(0)
+        out_json = self._get_scrub_status()
+        self.assertTrue("no active" in out_json['status'])
+
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+        self._check_task_status("idle")
+
+    def test_scrub_task_status_on_mds_failover(self):
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+
+        (original_active, ) = self.fs.get_active_names()
+        original_standbys = self.mds_cluster.get_standby_daemons()
+        self._check_task_status("idle")
+
+        # Kill the rank 0
+        self.fs.mds_stop(original_active)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        def promoted():
+            active = self.fs.get_active_names()
+            return active and active[0] in original_standbys
+
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
+        self.wait_until_true(promoted, timeout=grace*2)
+
+        mgr_beacon_grace = float(self.fs.get_config("mgr_service_beacon_grace", service_type="mon"))
+
+        def status_check():
+            task_status = self.fs.get_task_status("scrub status")
+            return original_active not in task_status
+        self.wait_until_true(status_check, timeout=mgr_beacon_grace*2)
+
+class TestScrubChecks(CephFSTestCase):
+    """
+    Run flush and scrub commands on the specified files in the filesystem. This
+    task will run through a sequence of operations, but it is not comprehensive
+    on its own -- it doesn't manipulate the mds cache state to test on both
+    in- and out-of-memory parts of the hierarchy. So it's designed to be run
+    multiple times within a single test run, so that the test can manipulate
+    memory state.
+
+    Usage:
+    mds_scrub_checks:
+      mds_rank: 0
+      path: path/to/test/dir
+      client: 0
+      run_seq: [0-9]+
+
+    Increment the run_seq on subsequent invocations within a single test run;
+    it uses that value to generate unique folder and file names.
+    """
+
+    MDSS_REQUIRED = 1
+    CLIENTS_REQUIRED = 1
+
+    def test_scrub_checks(self):
+        self._checks(0)
+        self._checks(1)
+
+    def _checks(self, run_seq):
+        mds_rank = 0
+        test_dir = "scrub_test_path"
+
+        abs_test_path = "/{0}".format(test_dir)
+
+        log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
+        client_path = os.path.join(self.mount_a.mountpoint, test_dir)
+        log.info("client_path: {0}".format(client_path))
+
+        log.info("Cloning repo into place")
+        repo_path = TestScrubChecks.clone_repo(self.mount_a, client_path)
+
+        log.info("Initiating mds_scrub_checks on mds.{id_}, " +
+                 "test_path {path}, run_seq {seq}".format(
+                     id_=mds_rank, path=abs_test_path, seq=run_seq)
+                 )
+
+
+        success_validator = lambda j, r: self.json_validator(j, r, "return_code", 0)
+
+        nep = "{test_path}/i/dont/exist".format(test_path=abs_test_path)
+        self.asok_command(mds_rank, "flush_path {nep}".format(nep=nep),
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+        self.tell_command(mds_rank, "scrub start {nep}".format(nep=nep),
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+
+        test_repo_path = "{test_path}/ceph-qa-suite".format(test_path=abs_test_path)
+        dirpath = "{repo_path}/suites".format(repo_path=test_repo_path)
+
+        if run_seq == 0:
+            log.info("First run: flushing {dirpath}".format(dirpath=dirpath))
+            command = "flush_path {dirpath}".format(dirpath=dirpath)
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub start {dirpath}".format(dirpath=dirpath)
+        self.tell_command(mds_rank, command, success_validator)
+
+        filepath = "{repo_path}/suites/fs/verify/validater/valgrind.yaml".format(
+            repo_path=test_repo_path)
+        if run_seq == 0:
+            log.info("First run: flushing {filepath}".format(filepath=filepath))
+            command = "flush_path {filepath}".format(filepath=filepath)
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub start {filepath}".format(filepath=filepath)
+        self.tell_command(mds_rank, command, success_validator)
+
+        filepath = "{repo_path}/suites/fs/basic/clusters/fixed-3-cephfs.yaml". \
+            format(repo_path=test_repo_path)
+        command = "scrub start {filepath}".format(filepath=filepath)
+        self.tell_command(mds_rank, command,
+                          lambda j, r: self.json_validator(j, r, "performed_validation",
+                                                           False))
+
+        if run_seq == 0:
+            log.info("First run: flushing base dir /")
+            command = "flush_path /"
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub start /"
+        self.tell_command(mds_rank, command, success_validator)
+
+        new_dir = "{repo_path}/new_dir_{i}".format(repo_path=repo_path, i=run_seq)
+        test_new_dir = "{repo_path}/new_dir_{i}".format(repo_path=test_repo_path,
+                                                        i=run_seq)
+        self.mount_a.run_shell(["mkdir", new_dir])
+        command = "flush_path {dir}".format(dir=test_new_dir)
+        self.asok_command(mds_rank, command, success_validator)
+
+        new_file = "{repo_path}/new_file_{i}".format(repo_path=repo_path,
+                                                     i=run_seq)
+        test_new_file = "{repo_path}/new_file_{i}".format(repo_path=test_repo_path,
+                                                          i=run_seq)
+        self.mount_a.write_n_mb(new_file, 1)
+
+        command = "flush_path {file}".format(file=test_new_file)
+        self.asok_command(mds_rank, command, success_validator)
+
+        # check that scrub fails on errors
+        ino = self.mount_a.path_to_ino(new_file)
+        rados_obj_name = "{ino:x}.00000000".format(ino=ino)
+        command = "scrub start {file}".format(file=test_new_file)
+
+        # Missing parent xattr -> ENODATA
+        self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
+        self.tell_command(mds_rank, command,
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENODATA))
+
+        # Missing object -> ENOENT
+        self.fs.rados(["rm", rados_obj_name], pool=self.fs.get_data_pool_name())
+        self.tell_command(mds_rank, command,
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+
+        command = "flush_path /"
+        self.asok_command(mds_rank, command, success_validator)
+
+    def test_scrub_repair(self):
+        mds_rank = 0
+        test_dir = "scrub_repair_path"
+
+        self.mount_a.run_shell(["sudo", "mkdir", test_dir])
+        self.mount_a.run_shell(["sudo", "touch", "{0}/file".format(test_dir)])
+        dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino(test_dir))
+
+        self.mount_a.umount_wait()
+
+        # flush journal entries to dirfrag objects, and expire journal
+        self.fs.mds_asok(['flush', 'journal'])
+        self.fs.mds_stop()
+
+        # remove the dentry from dirfrag, cause incorrect fragstat/rstat
+        self.fs.rados(["rmomapkey", dir_objname, "file_head"],
+                      pool=self.fs.get_metadata_pool_name())
+
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # fragstat indicates the directory is not empty, rmdir should fail
+        with self.assertRaises(CommandFailedError) as ar:
+            self.mount_a.run_shell(["sudo", "rmdir", test_dir])
+        self.assertEqual(ar.exception.exitstatus, 1)
+
+        self.tell_command(mds_rank, "scrub start /{0} repair".format(test_dir),
+                          lambda j, r: self.json_validator(j, r, "return_code", 0))
+
+        # wait a few second for background repair
+        time.sleep(10)
+
+        # fragstat should be fixed
+        self.mount_a.run_shell(["sudo", "rmdir", test_dir])
+
+    @staticmethod
+    def json_validator(json_out, rc, element, expected_value):
+        if rc != 0:
+            return False, "asok command returned error {rc}".format(rc=rc)
+        element_value = json_out.get(element)
+        if element_value != expected_value:
+            return False, "unexpectedly got {jv} instead of {ev}!".format(
+                jv=element_value, ev=expected_value)
+        return True, "Succeeded"
+
+    def tell_command(self, mds_rank, command, validator):
+        log.info("Running command '{command}'".format(command=command))
+
+        command_list = command.split()
+        jout = self.fs.rank_tell(command_list, mds_rank)
+
+        log.info("command '{command}' returned '{jout}'".format(
+                     command=command, jout=jout))
+
+        success, errstring = validator(jout, 0)
+        if not success:
+            raise AsokCommandFailedError(command, 0, jout, errstring)
+        return jout
+
+    def asok_command(self, mds_rank, command, validator):
+        log.info("Running command '{command}'".format(command=command))
+
+        command_list = command.split()
+
+        # we just assume there's an active mds for every rank
+        mds_id = self.fs.get_active_names()[mds_rank]
+        proc = self.fs.mon_manager.admin_socket('mds', mds_id,
+                                                command_list, check_status=False)
+        rout = proc.exitstatus
+        sout = proc.stdout.getvalue()
+
+        if sout.strip():
+            jout = json.loads(sout)
+        else:
+            jout = None
+
+        log.info("command '{command}' got response code " +
+                 "'{rout}' and stdout '{sout}'".format(
+                     command=command, rout=rout, sout=sout))
+
+        success, errstring = validator(jout, rout)
+
+        if not success:
+            raise AsokCommandFailedError(command, rout, jout, errstring)
+
+        return jout
+
+    @staticmethod
+    def clone_repo(client_mount, path):
+        repo = "ceph-qa-suite"
+        repo_path = os.path.join(path, repo)
+        client_mount.run_shell(["mkdir", "-p", path])
+
+        try:
+            client_mount.stat(repo_path)
+        except CommandFailedError:
+            client_mount.run_shell([
+                "git", "clone", '--branch', 'giant',
+                "http://github.com/ceph/{repo}".format(repo=repo),
+                "{path}/{repo}".format(path=path, repo=repo)
+            ])
+
+        return repo_path
+
+
+class AsokCommandFailedError(Exception):
+    """
+    Exception thrown when we get an unexpected response
+    on an admin socket command
+    """
+
+    def __init__(self, command, rc, json_out, errstring):
+        self.command = command
+        self.rc = rc
+        self.json = json_out
+        self.errstring = errstring
+
+    def __str__(self):
+        return "Admin socket: {command} failed with rc={rc}," + \
+               "json output={json}, because '{es}'".format(
+                   command=self.command, rc=self.rc,
+                   json=self.json, es=self.errstring)
diff --git a/qa/tasks/cephfs/test_sessionmap.py b/qa/tasks/cephfs/test_sessionmap.py
new file mode 100644
index 00000000..b44991f5
--- /dev/null
+++ b/qa/tasks/cephfs/test_sessionmap.py
@@ -0,0 +1,236 @@
+import time
+import json
+import logging
+from unittest import SkipTest
+
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.misc import sudo_write_file
+
+log = logging.getLogger(__name__)
+
+
+class TestSessionMap(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 2
+
+    def test_tell_session_drop(self):
+        """
+        That when a `tell` command is sent using the python CLI,
+        its MDS session is gone after it terminates
+        """
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        status = self.fs.status()
+        self.fs.rank_tell(["session", "ls"], status=status)
+
+        ls_data = self.fs.rank_asok(['session', 'ls'], status=status)
+        self.assertEqual(len(ls_data), 0)
+
+    def _get_connection_count(self, status=None):
+        perf = self.fs.rank_asok(["perf", "dump"], status=status)
+        conn = 0
+        for module, dump in perf.items():
+            if "AsyncMessenger::Worker" in module:
+                conn += dump['msgr_active_connections']
+        return conn
+
+    def test_tell_conn_close(self):
+        """
+        That when a `tell` command is sent using the python CLI,
+        the conn count goes back to where it started (i.e. we aren't
+        leaving connections open)
+        """
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        status = self.fs.status()
+        s = self._get_connection_count(status=status)
+        self.fs.rank_tell(["session", "ls"], status=status)
+        e = self._get_connection_count(status=status)
+
+        self.assertEqual(s, e)
+
+    def test_mount_conn_close(self):
+        """
+        That when a client unmounts, the thread count on the MDS goes back
+        to what it was before the client mounted
+        """
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        status = self.fs.status()
+        s = self._get_connection_count(status=status)
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.assertGreater(self._get_connection_count(status=status), s)
+        self.mount_a.umount_wait()
+        e = self._get_connection_count(status=status)
+
+        self.assertEqual(s, e)
+
+    def test_version_splitting(self):
+        """
+        That when many sessions are updated, they are correctly
+        split into multiple versions to obey mds_sessionmap_keys_per_op
+        """
+
+        # Start umounted
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # Configure MDS to write one OMAP key at once
+        self.set_conf('mds', 'mds_sessionmap_keys_per_op', 1)
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # I would like two MDSs, so that I can do an export dir later
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+
+        status = self.fs.status()
+
+        # Bring the clients back
+        self.mount_a.mount()
+        self.mount_b.mount()
+        self.mount_a.create_files()  # Kick the client into opening sessions
+        self.mount_b.create_files()
+
+        # See that they've got sessions
+        self.assert_session_count(2, mds_id=self.fs.get_rank(status=status)['name'])
+
+        # See that we persist their sessions
+        self.fs.rank_asok(["flush", "journal"], rank=0, status=status)
+        table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+        log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+        self.assertEqual(table_json['0']['result'], 0)
+        self.assertEqual(len(table_json['0']['data']['sessions']), 2)
+
+        # Now, induce a "force_open_sessions" event by exporting a dir
+        self.mount_a.run_shell(["mkdir", "bravo"])
+        self.mount_a.run_shell(["touch", "bravo/file"])
+        self.mount_b.run_shell(["ls", "-l", "bravo/file"])
+
+        def get_omap_wrs():
+            return self.fs.rank_asok(['perf', 'dump', 'objecter'], rank=1, status=status)['objecter']['omap_wr']
+
+        # Flush so that there are no dirty sessions on rank 1
+        self.fs.rank_asok(["flush", "journal"], rank=1, status=status)
+
+        # Export so that we get a force_open to rank 1 for the two sessions from rank 0
+        initial_omap_wrs = get_omap_wrs()
+        self.fs.rank_asok(['export', 'dir', '/bravo', '1'], rank=0, status=status)
+
+        # This is the critical (if rather subtle) check: that in the process of doing an export dir,
+        # we hit force_open_sessions, and as a result we end up writing out the sessionmap.  There
+        # will be two sessions dirtied here, and because we have set keys_per_op to 1, we should see
+        # a single session get written out (the first of the two, triggered by the second getting marked
+        # dirty)
+        # The number of writes is two per session, because the header (sessionmap version) update and
+        # KV write both count. Also, multiply by 2 for each openfile table update.
+        self.wait_until_true(
+            lambda: get_omap_wrs() - initial_omap_wrs == 2*2,
+            timeout=30  # Long enough for an export to get acked
+        )
+
+        # Now end our sessions and check the backing sessionmap is updated correctly
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # In-memory sessionmap check
+        self.assert_session_count(0, mds_id=self.fs.get_rank(status=status)['name'])
+
+        # On-disk sessionmap check
+        self.fs.rank_asok(["flush", "journal"], rank=0, status=status)
+        table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+        log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+        self.assertEqual(table_json['0']['result'], 0)
+        self.assertEqual(len(table_json['0']['data']['sessions']), 0)
+
+    def _configure_auth(self, mount, id_name, mds_caps, osd_caps=None, mon_caps=None):
+        """
+        Set up auth credentials for a client mount, and write out the keyring
+        for the client to use.
+        """
+
+        if osd_caps is None:
+            osd_caps = "allow rw"
+
+        if mon_caps is None:
+            mon_caps = "allow r"
+
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.{name}".format(name=id_name),
+            "mds", mds_caps,
+            "osd", osd_caps,
+            "mon", mon_caps
+        )
+        mount.client_id = id_name
+        sudo_write_file(mount.client_remote, mount.get_keyring_path(), out)
+        self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
+
+    def test_session_reject(self):
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Requires FUSE client to inject client metadata")
+
+        self.mount_a.run_shell(["mkdir", "foo"])
+        self.mount_a.run_shell(["mkdir", "foo/bar"])
+        self.mount_a.umount_wait()
+
+        # Mount B will be my rejected client
+        self.mount_b.umount_wait()
+
+        # Configure a client that is limited to /foo/bar
+        self._configure_auth(self.mount_b, "badguy", "allow rw path=/foo/bar")
+        # Check he can mount that dir and do IO
+        self.mount_b.mount(mount_path="/foo/bar")
+        self.mount_b.wait_until_mounted()
+        self.mount_b.create_destroy()
+        self.mount_b.umount_wait()
+
+        # Configure the client to claim that its mount point metadata is /baz
+        self.set_conf("client.badguy", "client_metadata", "root=/baz")
+        # Try to mount the client, see that it fails
+        with self.assert_cluster_log("client session with non-allowable root '/baz' denied"):
+            with self.assertRaises(CommandFailedError):
+                self.mount_b.mount(mount_path="/foo/bar")
+
+    def test_session_evict_blacklisted(self):
+        """
+        Check that mds evicts blacklisted client
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Requires FUSE client to use is_blacklisted()")
+
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+        status = self.fs.status()
+
+        self.mount_a.run_shell(["mkdir", "d0", "d1"])
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 0, [('/d0', 0), ('/d1', 1)])
+
+        self.mount_a.run_shell(["touch", "d0/f0"])
+        self.mount_a.run_shell(["touch", "d1/f0"])
+        self.mount_b.run_shell(["touch", "d0/f1"])
+        self.mount_b.run_shell(["touch", "d1/f1"])
+
+        self.assert_session_count(2, mds_id=self.fs.get_rank(rank=0, status=status)['name'])
+        self.assert_session_count(2, mds_id=self.fs.get_rank(rank=1, status=status)['name'])
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id],
+                         mds_id=self.fs.get_rank(rank=0, status=status)['name'])
+        self.wait_until_true(lambda: self.mount_a.is_blacklisted(), timeout=30)
+
+        # 10 seconds should be enough for evicting client
+        time.sleep(10)
+        self.assert_session_count(1, mds_id=self.fs.get_rank(rank=0, status=status)['name'])
+        self.assert_session_count(1, mds_id=self.fs.get_rank(rank=1, status=status)['name'])
+
+        self.mount_a.kill_cleanup()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
diff --git a/qa/tasks/cephfs/test_snapshots.py b/qa/tasks/cephfs/test_snapshots.py
new file mode 100644
index 00000000..f09b645c
--- /dev/null
+++ b/qa/tasks/cephfs/test_snapshots.py
@@ -0,0 +1,530 @@
+import sys
+import logging
+import signal
+from textwrap import dedent
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra.run import CommandFailedError, Raw
+from unittest import SkipTest
+
+log = logging.getLogger(__name__)
+
+MDS_RESTART_GRACE = 60
+
+class TestSnapshots(CephFSTestCase):
+    MDSS_REQUIRED = 3
+    LOAD_SETTINGS = ["mds_max_snaps_per_dir"]
+
+    def _check_subtree(self, rank, path, status=None):
+        got_subtrees = self.fs.rank_asok(["get", "subtrees"], rank=rank, status=status)
+        for s in got_subtrees:
+            if s['dir']['path'] == path and s['auth_first'] == rank:
+                return True
+        return False
+
+    def _get_snapclient_dump(self, rank=0, status=None):
+        return self.fs.rank_asok(["dump", "snaps"], rank=rank, status=status)
+
+    def _get_snapserver_dump(self, rank=0, status=None):
+        return self.fs.rank_asok(["dump", "snaps", "--server"], rank=rank, status=status)
+
+    def _get_last_created_snap(self, rank=0, status=None):
+        return int(self._get_snapserver_dump(rank,status=status)["last_created"])
+
+    def _get_last_destroyed_snap(self, rank=0, status=None):
+        return int(self._get_snapserver_dump(rank,status=status)["last_destroyed"])
+
+    def _get_pending_snap_update(self, rank=0, status=None):
+        return self._get_snapserver_dump(rank,status=status)["pending_update"]
+
+    def _get_pending_snap_destroy(self, rank=0, status=None):
+        return self._get_snapserver_dump(rank,status=status)["pending_destroy"]
+
+    def test_kill_mdstable(self):
+        """
+        check snaptable transcation
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to forcibly kill mount")
+
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        # setup subtrees
+        self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
+        self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+        self.wait_until_true(lambda: self._check_subtree(1, '/d1', status=status), timeout=30)
+
+        last_created = self._get_last_created_snap(rank=0,status=status)
+
+        # mds_kill_mdstable_at:
+        #  1: MDSTableServer::handle_prepare
+        #  2: MDSTableServer::_prepare_logged
+        #  5: MDSTableServer::handle_commit
+        #  6: MDSTableServer::_commit_logged
+        for i in [1,2,5,6]:
+            log.info("testing snapserver mds_kill_mdstable_at={0}".format(i))
+
+            status = self.fs.status()
+            rank0 = self.fs.get_rank(rank=0, status=status)
+            self.fs.rank_freeze(True, rank=0)
+            self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
+            proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+            self.delete_mds_coredump(rank0['name']);
+
+            self.fs.rank_fail(rank=0)
+            self.fs.mds_restart(rank0['name'])
+            self.wait_for_daemon_start([rank0['name']])
+            status = self.fs.wait_for_daemons()
+
+            proc.wait()
+            last_created += 1
+            self.wait_until_true(lambda: self._get_last_created_snap(rank=0) == last_created, timeout=30)
+
+        self.set_conf("mds", "mds_reconnect_timeout", "5")
+
+        self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+        # set mds_kill_mdstable_at, also kill snapclient
+        for i in [2,5,6]:
+            log.info("testing snapserver mds_kill_mdstable_at={0}, also kill snapclient".format(i))
+            status = self.fs.status()
+            last_created = self._get_last_created_snap(rank=0, status=status)
+
+            rank0 = self.fs.get_rank(rank=0, status=status)
+            rank1 = self.fs.get_rank(rank=1, status=status)
+            self.fs.rank_freeze(True, rank=0) # prevent failover...
+            self.fs.rank_freeze(True, rank=1) # prevent failover...
+            self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
+            proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+            self.delete_mds_coredump(rank0['name']);
+
+            self.fs.rank_signal(signal.SIGKILL, rank=1)
+
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+            self.fs.rank_fail(rank=0)
+            self.fs.mds_restart(rank0['name'])
+            self.wait_for_daemon_start([rank0['name']])
+
+            self.fs.wait_for_state('up:resolve', rank=0, timeout=MDS_RESTART_GRACE)
+            if i in [2,5]:
+                self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
+            elif i == 6:
+                self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0)
+                self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+            self.fs.rank_fail(rank=1)
+            self.fs.mds_restart(rank1['name'])
+            self.wait_for_daemon_start([rank1['name']])
+            self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+
+            if i in [2,5]:
+                self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+                if i == 2:
+                    self.assertEqual(self._get_last_created_snap(rank=0), last_created)
+                else:
+                    self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+
+        self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+        # mds_kill_mdstable_at:
+        #  3: MDSTableClient::handle_request (got agree)
+        #  4: MDSTableClient::commit
+        #  7: MDSTableClient::handle_request (got ack)
+        for i in [3,4,7]:
+            log.info("testing snapclient mds_kill_mdstable_at={0}".format(i))
+            last_created = self._get_last_created_snap(rank=0)
+
+            status = self.fs.status()
+            rank1 = self.fs.get_rank(rank=1, status=status)
+            self.fs.rank_freeze(True, rank=1) # prevent failover...
+            self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
+            proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+            self.delete_mds_coredump(rank1['name']);
+
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+            if i in [3,4]:
+                self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
+            elif i == 7:
+                self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0)
+                self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+            self.fs.rank_fail(rank=1)
+            self.fs.mds_restart(rank1['name'])
+            self.wait_for_daemon_start([rank1['name']])
+            status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+
+            if i in [3,4]:
+                self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+                if i == 3:
+                    self.assertEqual(self._get_last_created_snap(rank=0), last_created)
+                else:
+                    self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+
+        self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+        # mds_kill_mdstable_at:
+        #  3: MDSTableClient::handle_request (got agree)
+        #  8: MDSTableServer::handle_rollback
+        log.info("testing snapclient mds_kill_mdstable_at=3, snapserver mds_kill_mdstable_at=8")
+        last_created = self._get_last_created_snap(rank=0)
+
+        status = self.fs.status()
+        rank0 = self.fs.get_rank(rank=0, status=status)
+        rank1 = self.fs.get_rank(rank=1, status=status)
+        self.fs.rank_freeze(True, rank=0)
+        self.fs.rank_freeze(True, rank=1)
+        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8".format(i)], rank=0, status=status)
+        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3".format(i)], rank=1, status=status)
+        proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4".format(i)], wait=False)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+        self.delete_mds_coredump(rank1['name']);
+
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
+
+        self.fs.rank_fail(rank=1)
+        self.fs.mds_restart(rank1['name'])
+        self.wait_for_daemon_start([rank1['name']])
+
+        # rollback triggers assertion
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+        self.delete_mds_coredump(rank0['name']);
+        self.fs.rank_fail(rank=0)
+        self.fs.mds_restart(rank0['name'])
+        self.wait_for_daemon_start([rank0['name']])
+        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+
+        # mds.1 should re-send rollback message
+        self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+        self.assertEqual(self._get_last_created_snap(rank=0), last_created)
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_snapclient_cache(self):
+        """
+        check if snapclient cache gets synced properly
+        """
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(3)
+        status = self.fs.wait_for_daemons()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("d0/d2", "ceph.dir.pin", "2")
+        self.wait_until_true(lambda: self._check_subtree(2, '/d0/d2', status=status), timeout=30)
+        self.wait_until_true(lambda: self._check_subtree(1, '/d0/d1', status=status), timeout=5)
+        self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status), timeout=5)
+
+        def _check_snapclient_cache(snaps_dump, cache_dump=None, rank=0):
+            if cache_dump is None:
+                cache_dump = self._get_snapclient_dump(rank=rank)
+            for key, value in cache_dump.items():
+                if value != snaps_dump[key]:
+                    return False
+            return True;
+
+        # sync after mksnap
+        last_created = self._get_last_created_snap(rank=0)
+        self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s1", "d0/d1/dir/.snap/s2"])
+        self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+        self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+        snaps_dump = self._get_snapserver_dump(rank=0)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+        # sync after rmsnap
+        last_destroyed = self._get_last_destroyed_snap(rank=0)
+        self.mount_a.run_shell(["rmdir", "d0/d1/dir/.snap/s1"])
+        self.wait_until_true(lambda: len(self._get_pending_snap_destroy(rank=0)) == 0, timeout=30)
+        self.assertGreater(self._get_last_destroyed_snap(rank=0), last_destroyed)
+
+        snaps_dump = self._get_snapserver_dump(rank=0)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+        # sync during mds recovers
+        self.fs.rank_fail(rank=2)
+        status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+        self.fs.rank_fail(rank=0)
+        self.fs.rank_fail(rank=1)
+        status = self.fs.wait_for_daemons()
+        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+        # kill at MDSTableClient::handle_notify_prep
+        status = self.fs.status()
+        rank2 = self.fs.get_rank(rank=2, status=status)
+        self.fs.rank_freeze(True, rank=2)
+        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
+        proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+        self.delete_mds_coredump(rank2['name']);
+
+        # mksnap should wait for notify ack from mds.2
+        self.assertFalse(proc.finished);
+
+        # mksnap should proceed after mds.2 fails
+        self.fs.rank_fail(rank=2)
+        self.wait_until_true(lambda: proc.finished, timeout=30);
+
+        self.fs.mds_restart(rank2['name'])
+        self.wait_for_daemon_start([rank2['name']])
+        status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+
+        self.mount_a.run_shell(["rmdir", Raw("d0/d1/dir/.snap/*")])
+
+        # kill at MDSTableClient::commit
+        # the recovering mds should sync all mds' cache when it enters resolve stage
+        self.set_conf("mds", "mds_reconnect_timeout", "5")
+        for i in range(1, 4):
+            status = self.fs.status()
+            rank2 = self.fs.get_rank(rank=2, status=status)
+            self.fs.rank_freeze(True, rank=2)
+            self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
+            last_created = self._get_last_created_snap(rank=0)
+            proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+            self.delete_mds_coredump(rank2['name']);
+
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+            self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
+
+            if i in [2,4]:
+                self.fs.rank_fail(rank=0)
+            if i in [3,4]:
+                self.fs.rank_fail(rank=1)
+
+            self.fs.rank_fail(rank=2)
+            self.fs.mds_restart(rank2['name'])
+            self.wait_for_daemon_start([rank2['name']])
+            status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+
+            rank0_cache = self._get_snapclient_dump(rank=0)
+            rank1_cache = self._get_snapclient_dump(rank=1)
+            rank2_cache = self._get_snapclient_dump(rank=2)
+
+            self.assertGreater(int(rank0_cache["last_created"]), last_created)
+            self.assertEqual(rank0_cache, rank1_cache);
+            self.assertEqual(rank0_cache, rank2_cache);
+
+            self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+
+            snaps_dump = self._get_snapserver_dump(rank=0)
+            self.assertEqual(snaps_dump["last_created"], rank0_cache["last_created"])
+            self.assertTrue(_check_snapclient_cache(snaps_dump, cache_dump=rank0_cache));
+
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+
+        self.mount_a.run_shell(["rmdir", Raw("d0/d2/dir/.snap/*")])
+
+    def test_multimds_mksnap(self):
+        """
+        check if snapshot takes effect across authority subtrees
+        """
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d1"])
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1")
+        self.wait_until_true(lambda: self._check_subtree(1, '/d0/d1', status=status), timeout=30)
+        self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status), timeout=5)
+
+        self.mount_a.write_test_pattern("d0/d1/file_a", 8 * 1024 * 1024)
+        self.mount_a.run_shell(["mkdir", "d0/.snap/s1"])
+        self.mount_a.run_shell(["rm", "-f", "d0/d1/file_a"])
+        self.mount_a.validate_test_pattern("d0/.snap/s1/d1/file_a", 8 * 1024 * 1024)
+
+        self.mount_a.run_shell(["rmdir", "d0/.snap/s1"])
+        self.mount_a.run_shell(["rm", "-rf", "d0"])
+
+    def test_multimds_past_parents(self):
+        """
+        check if past parents are properly recorded during across authority rename
+        """
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.run_shell(["mkdir", "d0", "d1"])
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+        self.wait_until_true(lambda: self._check_subtree(1, '/d1', status=status), timeout=30)
+        self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status), timeout=5)
+
+        self.mount_a.run_shell(["mkdir", "d0/d3"])
+        self.mount_a.run_shell(["mkdir", "d0/.snap/s1"])
+        snap_name = self.mount_a.run_shell(["ls", "d0/d3/.snap"]).stdout.getvalue()
+
+        self.mount_a.run_shell(["mv", "d0/d3", "d1/d3"])
+        snap_name1 = self.mount_a.run_shell(["ls", "d1/d3/.snap"]).stdout.getvalue()
+        self.assertEqual(snap_name1, snap_name);
+
+        self.mount_a.run_shell(["rmdir", "d0/.snap/s1"])
+        snap_name1 = self.mount_a.run_shell(["ls", "d1/d3/.snap"]).stdout.getvalue()
+        self.assertEqual(snap_name1, "");
+
+        self.mount_a.run_shell(["rm", "-rf", "d0", "d1"])
+
+    def test_multimds_hardlink(self):
+        """
+        check if hardlink snapshot works in multimds setup
+        """
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.run_shell(["mkdir", "d0", "d1"])
+
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+        self.wait_until_true(lambda: self._check_subtree(1, '/d1', status=status), timeout=30)
+        self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status), timeout=5)
+
+        self.mount_a.run_python(dedent("""
+            import os
+            open(os.path.join("{path}", "d0/file1"), 'w').write("asdf")
+            open(os.path.join("{path}", "d0/file2"), 'w').write("asdf")
+            """.format(path=self.mount_a.mountpoint)
+        ))
+
+        self.mount_a.run_shell(["ln", "d0/file1", "d1/file1"])
+        self.mount_a.run_shell(["ln", "d0/file2", "d1/file2"])
+
+        self.mount_a.run_shell(["mkdir", "d1/.snap/s1"])
+
+        self.mount_a.run_python(dedent("""
+            import os
+            open(os.path.join("{path}", "d0/file1"), 'w').write("qwer")
+            """.format(path=self.mount_a.mountpoint)
+        ))
+
+        self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file1"])
+
+        self.mount_a.run_shell(["rm", "-f", "d0/file2"])
+        self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file2"])
+
+        self.mount_a.run_shell(["rm", "-f", "d1/file2"])
+        self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file2"])
+
+        self.mount_a.run_shell(["rmdir", "d1/.snap/s1"])
+        self.mount_a.run_shell(["rm", "-rf", "d0", "d1"])
+
+    class SnapLimitViolationException(Exception):
+        failed_snapshot_number = -1
+
+        def __init__(self, num):
+            self.failed_snapshot_number = num
+
+    def get_snap_name(self, dir_name, sno):
+            sname = "{dir_name}/.snap/s_{sno}".format(dir_name=dir_name, sno=sno)
+            return sname
+
+    def create_snap_dir(self, sname):
+        self.mount_a.run_shell(["mkdir", sname])
+
+    def delete_dir_and_snaps(self, dir_name, snaps):
+        for sno in range(1, snaps+1, 1):
+            sname = self.get_snap_name(dir_name, sno)
+            self.mount_a.run_shell(["rmdir", sname])
+        self.mount_a.run_shell(["rmdir", dir_name])
+
+    def create_dir_and_snaps(self, dir_name, snaps):
+        self.mount_a.run_shell(["mkdir", dir_name])
+
+        for sno in range(1, snaps+1, 1):
+            sname = self.get_snap_name(dir_name, sno)
+            try:
+                self.create_snap_dir(sname)
+            except CommandFailedError as e:
+                # failing at the last mkdir beyond the limit is expected
+                if sno == snaps:
+                    log.info("failed while creating snap #{}: {}".format(sno, repr(e)))
+                    raise TestSnapshots.SnapLimitViolationException(sno)
+
+    def test_mds_max_snaps_per_dir_default_limit(self):
+        """
+        Test the newly introudced option named mds_max_snaps_per_dir
+        Default snaps limit is 100
+        Test if the default number of snapshot directories can be created
+        """
+        self.create_dir_and_snaps("accounts", int(self.mds_max_snaps_per_dir))
+        self.delete_dir_and_snaps("accounts", int(self.mds_max_snaps_per_dir))
+
+    def test_mds_max_snaps_per_dir_with_increased_limit(self):
+        """
+        Test the newly introudced option named mds_max_snaps_per_dir
+        First create 101 directories and ensure that the 101st directory
+        creation fails. Then increase the default by one and see if the
+        additional directory creation succeeds
+        """
+        # first test the default limit
+        new_limit = int(self.mds_max_snaps_per_dir)
+        self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)])
+        try:
+            self.create_dir_and_snaps("accounts", new_limit + 1)
+        except TestSnapshots.SnapLimitViolationException as e:
+            if e.failed_snapshot_number == (new_limit + 1):
+                pass
+        # then increase the limit by one and test
+        new_limit = new_limit + 1
+        self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)])
+        sname = self.get_snap_name("accounts", new_limit)
+        self.create_snap_dir(sname)
+        self.delete_dir_and_snaps("accounts", new_limit)
+
+    def test_mds_max_snaps_per_dir_with_reduced_limit(self):
+        """
+        Test the newly introudced option named mds_max_snaps_per_dir
+        First create 99 directories. Then reduce the limit to 98. Then try
+        creating another directory and ensure that additional directory
+        creation fails.
+        """
+        # first test the new limit
+        new_limit = int(self.mds_max_snaps_per_dir) - 1
+        self.create_dir_and_snaps("accounts", new_limit)
+        sname = self.get_snap_name("accounts", new_limit + 1)
+        # then reduce the limit by one and test
+        new_limit = new_limit - 1
+        self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)])
+        try:
+            self.create_snap_dir(sname)
+        except CommandFailedError:
+            # after reducing limit we expect the new snapshot creation to fail
+            pass
+        self.delete_dir_and_snaps("accounts", new_limit + 1)
diff --git a/qa/tasks/cephfs/test_strays.py b/qa/tasks/cephfs/test_strays.py
new file mode 100644
index 00000000..f518afe7
--- /dev/null
+++ b/qa/tasks/cephfs/test_strays.py
@@ -0,0 +1,973 @@
+import json
+import time
+import logging
+from textwrap import dedent
+import datetime
+import gevent
+
+from teuthology.orchestra.run import CommandFailedError, Raw
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+class TestStrays(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    OPS_THROTTLE = 1
+    FILES_THROTTLE = 2
+
+    # Range of different file sizes used in throttle test's workload
+    throttle_workload_size_range = 16
+
+    @for_teuthology
+    def test_ops_throttle(self):
+        self._test_throttling(self.OPS_THROTTLE)
+
+    @for_teuthology
+    def test_files_throttle(self):
+        self._test_throttling(self.FILES_THROTTLE)
+
+    def test_dir_deletion(self):
+        """
+        That when deleting a bunch of dentries and the containing
+        directory, everything gets purged.
+        Catches cases where the client might e.g. fail to trim
+        the unlinked dir from its cache.
+        """
+        file_count = 1000
+        create_script = dedent("""
+            import os
+
+            mount_path = "{mount_path}"
+            subdir = "delete_me"
+            size = {size}
+            file_count = {file_count}
+            os.mkdir(os.path.join(mount_path, subdir))
+            for i in range(0, file_count):
+                filename = "{{0}}_{{1}}.bin".format(i, size)
+                with open(os.path.join(mount_path, subdir, filename), 'w') as f:
+                    f.write(size * 'x')
+        """.format(
+            mount_path=self.mount_a.mountpoint,
+            size=1024,
+            file_count=file_count
+        ))
+
+        self.mount_a.run_python(create_script)
+
+        # That the dirfrag object is created
+        self.fs.mds_asok(["flush", "journal"])
+        dir_ino = self.mount_a.path_to_ino("delete_me")
+        self.assertTrue(self.fs.dirfrag_exists(dir_ino, 0))
+
+        # Remove everything
+        self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+        self.fs.mds_asok(["flush", "journal"])
+
+        # That all the removed files get created as strays
+        strays = self.get_mdc_stat("strays_created")
+        self.assertEqual(strays, file_count + 1)
+
+        # That the strays all get enqueued for purge
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("strays_enqueued"),
+            strays,
+            timeout=600
+
+        )
+
+        # That all the purge operations execute
+        self.wait_until_equal(
+            lambda: self.get_stat("purge_queue", "pq_executed"),
+            strays,
+            timeout=600
+        )
+
+        # That finally, the directory metadata object is gone
+        self.assertFalse(self.fs.dirfrag_exists(dir_ino, 0))
+
+        # That finally, the data objects are all gone
+        self.await_data_pool_empty()
+
+    def _test_throttling(self, throttle_type):
+        self.data_log = []
+        try:
+            return self._do_test_throttling(throttle_type)
+        except:
+            for l in self.data_log:
+                log.info(",".join([l_.__str__() for l_ in l]))
+            raise
+
+    def _do_test_throttling(self, throttle_type):
+        """
+        That the mds_max_purge_ops setting is respected
+        """
+
+        def set_throttles(files, ops):
+            """
+            Helper for updating ops/files limits, and calculating effective
+            ops_per_pg setting to give the same ops limit.
+            """
+            self.set_conf('mds', 'mds_max_purge_files', "%d" % files)
+            self.set_conf('mds', 'mds_max_purge_ops', "%d" % ops)
+
+            pgs = self.fs.mon_manager.get_pool_property(
+                self.fs.get_data_pool_name(),
+                "pg_num"
+            )
+            ops_per_pg = float(ops) / pgs
+            self.set_conf('mds', 'mds_max_purge_ops_per_pg', "%s" % ops_per_pg)
+
+        # Test conditions depend on what we're going to be exercising.
+        # * Lift the threshold on whatever throttle we are *not* testing, so
+        #   that the throttle of interest is the one that will be the bottleneck
+        # * Create either many small files (test file count throttling) or fewer
+        #   large files (test op throttling)
+        if throttle_type == self.OPS_THROTTLE:
+            set_throttles(files=100000000, ops=16)
+            size_unit = 1024 * 1024  # big files, generate lots of ops
+            file_multiplier = 100
+        elif throttle_type == self.FILES_THROTTLE:
+            # The default value of file limit is pretty permissive, so to avoid
+            # the test running too fast, create lots of files and set the limit
+            # pretty low.
+            set_throttles(ops=100000000, files=6)
+            size_unit = 1024  # small, numerous files
+            file_multiplier = 200
+        else:
+            raise NotImplementedError(throttle_type)
+
+        # Pick up config changes
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        create_script = dedent("""
+            import os
+
+            mount_path = "{mount_path}"
+            subdir = "delete_me"
+            size_unit = {size_unit}
+            file_multiplier = {file_multiplier}
+            os.mkdir(os.path.join(mount_path, subdir))
+            for i in range(0, file_multiplier):
+                for size in range(0, {size_range}*size_unit, size_unit):
+                    filename = "{{0}}_{{1}}.bin".format(i, size // size_unit)
+                    with open(os.path.join(mount_path, subdir, filename), 'w') as f:
+                        f.write(size * 'x')
+        """.format(
+            mount_path=self.mount_a.mountpoint,
+            size_unit=size_unit,
+            file_multiplier=file_multiplier,
+            size_range=self.throttle_workload_size_range
+        ))
+
+        self.mount_a.run_python(create_script)
+
+        # We will run the deletion in the background, to reduce the risk of it completing before
+        # we have started monitoring the stray statistics.
+        def background():
+            self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+            self.fs.mds_asok(["flush", "journal"])
+
+        background_thread = gevent.spawn(background)
+
+        total_inodes = file_multiplier * self.throttle_workload_size_range + 1
+        mds_max_purge_ops = int(self.fs.get_config("mds_max_purge_ops", 'mds'))
+        mds_max_purge_files = int(self.fs.get_config("mds_max_purge_files", 'mds'))
+
+        # During this phase we look for the concurrent ops to exceed half
+        # the limit (a heuristic) and not exceed the limit (a correctness
+        # condition).
+        purge_timeout = 600
+        elapsed = 0
+        files_high_water = 0
+        ops_high_water = 0
+
+        while True:
+            stats = self.fs.mds_asok(['perf', 'dump'])
+            mdc_stats = stats['mds_cache']
+            pq_stats = stats['purge_queue']
+            if elapsed >= purge_timeout:
+                raise RuntimeError("Timeout waiting for {0} inodes to purge, stats:{1}".format(total_inodes, mdc_stats))
+
+            num_strays = mdc_stats['num_strays']
+            num_strays_purging = pq_stats['pq_executing']
+            num_purge_ops = pq_stats['pq_executing_ops']
+            files_high_water = pq_stats['pq_executing_high_water']
+            ops_high_water = pq_stats['pq_executing_ops_high_water']
+
+            self.data_log.append([datetime.datetime.now(), num_strays, num_strays_purging, num_purge_ops, files_high_water, ops_high_water])
+
+            total_strays_created = mdc_stats['strays_created']
+            total_strays_purged = pq_stats['pq_executed']
+
+            if total_strays_purged == total_inodes:
+                log.info("Complete purge in {0} seconds".format(elapsed))
+                break
+            elif total_strays_purged > total_inodes:
+                raise RuntimeError("Saw more strays than expected, mdc stats: {0}".format(mdc_stats))
+            else:
+                if throttle_type == self.OPS_THROTTLE:
+                    # 11 is filer_max_purge_ops plus one for the backtrace:
+                    # limit is allowed to be overshot by this much.
+                    if num_purge_ops > mds_max_purge_ops + 11:
+                        raise RuntimeError("num_purge_ops violates threshold {0}/{1}".format(
+                            num_purge_ops, mds_max_purge_ops
+                        ))
+                elif throttle_type == self.FILES_THROTTLE:
+                    if num_strays_purging > mds_max_purge_files:
+                        raise RuntimeError("num_strays_purging violates threshold {0}/{1}".format(
+                            num_strays_purging, mds_max_purge_files
+                        ))
+                else:
+                    raise NotImplementedError(throttle_type)
+
+                log.info("Waiting for purge to complete {0}/{1}, {2}/{3}".format(
+                    num_strays_purging, num_strays,
+                    total_strays_purged, total_strays_created
+                ))
+                time.sleep(1)
+                elapsed += 1
+
+        background_thread.join()
+
+        # Check that we got up to a respectable rate during the purge.  This is totally
+        # racy, but should be safeish unless the cluster is pathologically slow, or
+        # insanely fast such that the deletions all pass before we have polled the
+        # statistics.
+        if throttle_type == self.OPS_THROTTLE:
+            if ops_high_water < mds_max_purge_ops // 2:
+                raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format(
+                    ops_high_water, mds_max_purge_ops
+                ))
+            # The MDS may go over mds_max_purge_ops for some items, like a
+            # heavily fragmented directory.  The throttle does not kick in
+            # until *after* we reach or exceed the limit.  This is expected
+            # because we don't want to starve the PQ or never purge a
+            # particularly large file/directory.
+            self.assertLessEqual(ops_high_water, mds_max_purge_ops+64)
+        elif throttle_type == self.FILES_THROTTLE:
+            if files_high_water < mds_max_purge_files // 2:
+                raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format(
+                    files_high_water, mds_max_purge_files
+                ))
+            self.assertLessEqual(files_high_water, mds_max_purge_files)
+
+        # Sanity check all MDC stray stats
+        stats = self.fs.mds_asok(['perf', 'dump'])
+        mdc_stats = stats['mds_cache']
+        pq_stats = stats['purge_queue']
+        self.assertEqual(mdc_stats['num_strays'], 0)
+        self.assertEqual(mdc_stats['num_strays_delayed'], 0)
+        self.assertEqual(pq_stats['pq_executing'], 0)
+        self.assertEqual(pq_stats['pq_executing_ops'], 0)
+        self.assertEqual(mdc_stats['strays_created'], total_inodes)
+        self.assertEqual(mdc_stats['strays_enqueued'], total_inodes)
+        self.assertEqual(pq_stats['pq_executed'], total_inodes)
+
+    def get_mdc_stat(self, name, mds_id=None):
+        return self.get_stat("mds_cache", name, mds_id)
+
+    def get_stat(self, subsys, name, mds_id=None):
+        return self.fs.mds_asok(['perf', 'dump', subsys, name],
+                                mds_id=mds_id)[subsys][name]
+
+    def _wait_for_counter(self, subsys, counter, expect_val, timeout=60,
+                          mds_id=None):
+        self.wait_until_equal(
+            lambda: self.get_stat(subsys, counter, mds_id),
+            expect_val=expect_val, timeout=timeout,
+            reject_fn=lambda x: x > expect_val
+        )
+
+    def test_open_inode(self):
+        """
+        That the case of a dentry unlinked while a client holds an
+        inode open is handled correctly.
+
+        The inode should be moved into a stray dentry, while the original
+        dentry and directory should be purged.
+
+        The inode's data should be purged when the client eventually closes
+        it.
+        """
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Write some bytes to a file
+        size_mb = 8
+
+        # Hold the file open
+        p = self.mount_a.open_background("open_file")
+        self.mount_a.write_n_mb("open_file", size_mb)
+        open_file_ino = self.mount_a.path_to_ino("open_file")
+
+        self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
+
+        # Unlink the dentry
+        self.mount_a.run_shell(["rm", "-f", "open_file"])
+
+        # Wait to see the stray count increment
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=1, timeout=60, reject_fn=lambda x: x > 1)
+
+        # See that while the stray count has incremented, none have passed
+        # on to the purge queue
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0)
+
+        # See that the client still holds 2 caps
+        self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
+
+        # See that the data objects remain in the data pool
+        self.assertTrue(self.fs.data_objects_present(open_file_ino, size_mb * 1024 * 1024))
+
+        # Now close the file
+        self.mount_a.kill_background(p)
+
+        # Wait to see the client cap count decrement
+        self.wait_until_equal(
+            lambda: self.get_session(mount_a_client_id)['num_caps'],
+            expect_val=1, timeout=60, reject_fn=lambda x: x > 2 or x < 1
+        )
+        # Wait to see the purge counter increment, stray count go to zero
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0, timeout=6, reject_fn=lambda x: x > 1
+        )
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        # See that the data objects no longer exist
+        self.assertTrue(self.fs.data_objects_absent(open_file_ino, size_mb * 1024 * 1024))
+
+        self.await_data_pool_empty()
+
+    def test_hardlink_reintegration(self):
+        """
+        That removal of primary dentry of hardlinked inode results
+        in reintegration of inode into the previously-remote dentry,
+        rather than lingering as a stray indefinitely.
+        """
+        # Write some bytes to file_a
+        size_mb = 8
+        self.mount_a.run_shell(["mkdir", "dir_1"])
+        self.mount_a.write_n_mb("dir_1/file_a", size_mb)
+        ino = self.mount_a.path_to_ino("dir_1/file_a")
+
+        # Create a hardlink named file_b
+        self.mount_a.run_shell(["mkdir", "dir_2"])
+        self.mount_a.run_shell(["ln", "dir_1/file_a", "dir_2/file_b"])
+        self.assertEqual(self.mount_a.path_to_ino("dir_2/file_b"), ino)
+
+        # Flush journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that backtrace for the file points to the file_a path
+        pre_unlink_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(pre_unlink_bt['ancestors'][0]['dname'], "file_a")
+
+        # empty mds cache. otherwise mds reintegrates stray when unlink finishes
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(['flush', 'journal'])
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount()
+
+        # Unlink file_a
+        self.mount_a.run_shell(["rm", "-f", "dir_1/file_a"])
+
+        # See that a stray was created
+        self.assertEqual(self.get_mdc_stat("num_strays"), 1)
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+
+        # Wait, see that data objects are still present (i.e. that the
+        # stray did not advance to purging given time)
+        time.sleep(30)
+        self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024))
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0)
+
+        # See that before reintegration, the inode's backtrace points to a stray dir
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assertTrue(self.get_backtrace_path(ino).startswith("stray"))
+
+        last_reintegrated = self.get_mdc_stat("strays_reintegrated")
+
+        # Do a metadata operation on the remaining link (mv is heavy handed, but
+        # others like touch may be satisfied from caps without poking MDS)
+        self.mount_a.run_shell(["mv", "dir_2/file_b", "dir_2/file_c"])
+
+        # Stray reintegration should happen as a result of the eval_remote call
+        # on responding to a client request.
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        # See the reintegration counter increment
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.assertGreater(curr_reintegrated, last_reintegrated)
+        last_reintegrated = curr_reintegrated
+
+        # Flush the journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that the backtrace for the file points to the remaining link's path
+        post_reint_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c")
+
+        # mds should reintegrates stray when unlink finishes
+        self.mount_a.run_shell(["ln", "dir_2/file_c", "dir_2/file_d"])
+        self.mount_a.run_shell(["rm", "-f", "dir_2/file_c"])
+
+        # Stray reintegration should happen as a result of the notify_stray call
+        # on completion of unlink
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        # See the reintegration counter increment
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.assertGreater(curr_reintegrated, last_reintegrated)
+        last_reintegrated = curr_reintegrated
+
+        # Flush the journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that the backtrace for the file points to the newest link's path
+        post_reint_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_d")
+
+        # Now really delete it
+        self.mount_a.run_shell(["rm", "-f", "dir_2/file_d"])
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        self.assert_purge_idle()
+        self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024))
+
+        # We caused the inode to go stray 3 times
+        self.assertEqual(self.get_mdc_stat("strays_created"), 3)
+        # We purged it at the last
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 1)
+
+    def test_mv_hardlink_cleanup(self):
+        """
+        That when doing a rename from A to B, and B has hardlinks,
+        then we make a stray for B which is then reintegrated
+        into one of his hardlinks.
+        """
+        # Create file_a, file_b, and a hardlink to file_b
+        size_mb = 8
+        self.mount_a.write_n_mb("file_a", size_mb)
+        file_a_ino = self.mount_a.path_to_ino("file_a")
+
+        self.mount_a.write_n_mb("file_b", size_mb)
+        file_b_ino = self.mount_a.path_to_ino("file_b")
+
+        self.mount_a.run_shell(["ln", "file_b", "linkto_b"])
+        self.assertEqual(self.mount_a.path_to_ino("linkto_b"), file_b_ino)
+
+        # mv file_a file_b
+        self.mount_a.run_shell(["mv", "file_a", "file_b"])
+
+        # Stray reintegration should happen as a result of the notify_stray call on
+        # completion of rename
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        self.assertGreaterEqual(self.get_mdc_stat("strays_reintegrated"), 1)
+
+        # No data objects should have been deleted, as both files still have linkage.
+        self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
+        self.assertTrue(self.fs.data_objects_present(file_b_ino, size_mb * 1024 * 1024))
+
+        self.fs.mds_asok(['flush', 'journal'])
+
+        post_reint_bt = self.fs.read_backtrace(file_b_ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "linkto_b")
+
+    def _setup_two_ranks(self):
+        # Set up two MDSs
+        self.fs.set_max_mds(2)
+
+        # See that we have two active MDSs
+        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        active_mds_names = self.fs.get_active_names()
+        rank_0_id = active_mds_names[0]
+        rank_1_id = active_mds_names[1]
+        log.info("Ranks 0 and 1 are {0} and {1}".format(
+            rank_0_id, rank_1_id))
+
+        # Get rid of other MDS daemons so that it's easier to know which
+        # daemons to expect in which ranks after restarts
+        for unneeded_mds in set(self.mds_cluster.mds_ids) - {rank_0_id, rank_1_id}:
+            self.mds_cluster.mds_stop(unneeded_mds)
+            self.mds_cluster.mds_fail(unneeded_mds)
+
+        return rank_0_id, rank_1_id
+
+    def _force_migrate(self, to_id, path, watch_ino):
+        """
+        :param to_id: MDS id to move it to
+        :param path: Filesystem path (string) to move
+        :param watch_ino: Inode number to look for at destination to confirm move
+        :return: None
+        """
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", path])
+
+        # Poll the MDS cache dump to watch for the export completing
+        migrated = False
+        migrate_timeout = 60
+        migrate_elapsed = 0
+        while not migrated:
+            data = self.fs.mds_asok(["dump", "cache"], to_id)
+            for inode_data in data:
+                if inode_data['ino'] == watch_ino:
+                    log.debug("Found ino in cache: {0}".format(json.dumps(inode_data, indent=2)))
+                    if inode_data['is_auth'] is True:
+                        migrated = True
+                    break
+
+            if not migrated:
+                if migrate_elapsed > migrate_timeout:
+                    raise RuntimeError("Migration hasn't happened after {0}s!".format(migrate_elapsed))
+                else:
+                    migrate_elapsed += 1
+                    time.sleep(1)
+
+    def _is_stopped(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank not in [i['rank'] for i in mds_map['info'].values()]
+
+    def test_purge_on_shutdown(self):
+        """
+        That when an MDS rank is shut down, its purge queue is
+        drained in the process.
+        """
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
+        self.mds_cluster.mds_fail_restart(rank_1_id)
+        self.fs.wait_for_daemons()
+
+        file_count = 5
+
+        self.mount_a.create_n_files("delete_me/file", file_count)
+
+        self._force_migrate(rank_1_id, "delete_me",
+                            self.mount_a.path_to_ino("delete_me/file_0"))
+
+        self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")])
+        self.mount_a.umount_wait()
+
+        # See all the strays go into purge queue
+        self._wait_for_counter("mds_cache", "strays_created", file_count, mds_id=rank_1_id)
+        self._wait_for_counter("mds_cache", "strays_enqueued", file_count, mds_id=rank_1_id)
+        self.assertEqual(self.get_stat("mds_cache", "num_strays", mds_id=rank_1_id), 0)
+
+        # See nothing get purged from the purge queue (yet)
+        time.sleep(10)
+        self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0)
+
+        # Shut down rank 1
+        self.fs.set_max_mds(1)
+
+        # It shouldn't proceed past stopping because its still not allowed
+        # to purge
+        time.sleep(10)
+        self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0)
+        self.assertFalse(self._is_stopped(1))
+
+        # Permit the daemon to start purging again
+        self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(rank_1_id),
+                                            'injectargs',
+                                            "--mds_max_purge_files 100")
+
+        # It should now proceed through shutdown
+        self.fs.wait_for_daemons(timeout=120)
+
+        # ...and in the process purge all that data
+        self.await_data_pool_empty()
+
+    def test_migration_on_shutdown(self):
+        """
+        That when an MDS rank is shut down, any non-purgeable strays
+        get migrated to another rank.
+        """
+
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        # Create a non-purgeable stray in a ~mds1 stray directory
+        # by doing a hard link and deleting the original file
+        self.mount_a.run_shell(["mkdir", "dir_1", "dir_2"])
+        self.mount_a.run_shell(["touch", "dir_1/original"])
+        self.mount_a.run_shell(["ln", "dir_1/original", "dir_2/linkto"])
+
+        self._force_migrate(rank_1_id, "dir_1",
+                            self.mount_a.path_to_ino("dir_1/original"))
+
+        # empty mds cache. otherwise mds reintegrates stray when unlink finishes
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(['flush', 'journal'], rank_0_id)
+        self.fs.mds_asok(['flush', 'journal'], rank_1_id)
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        active_mds_names = self.fs.get_active_names()
+        rank_0_id = active_mds_names[0]
+        rank_1_id = active_mds_names[1]
+
+        self.mount_a.mount()
+
+        self.mount_a.run_shell(["rm", "-f", "dir_1/original"])
+        self.mount_a.umount_wait()
+
+        self._wait_for_counter("mds_cache", "strays_created", 1,
+                               mds_id=rank_1_id)
+
+        # Shut down rank 1
+        self.fs.set_max_mds(1)
+        self.fs.wait_for_daemons(timeout=120)
+
+        # See that the stray counter on rank 0 has incremented
+        self.assertEqual(self.get_mdc_stat("strays_created", rank_0_id), 1)
+
+    def assert_backtrace(self, ino, expected_path):
+        """
+        Assert that the backtrace in the data pool for an inode matches
+        an expected /foo/bar path.
+        """
+        expected_elements = expected_path.strip("/").split("/")
+        bt = self.fs.read_backtrace(ino)
+        actual_elements = list(reversed([dn['dname'] for dn in bt['ancestors']]))
+        self.assertListEqual(expected_elements, actual_elements)
+
+    def get_backtrace_path(self, ino):
+        bt = self.fs.read_backtrace(ino)
+        elements = reversed([dn['dname'] for dn in bt['ancestors']])
+        return "/".join(elements)
+
+    def assert_purge_idle(self):
+        """
+        Assert that the MDS perf counters indicate no strays exist and
+        no ongoing purge activity.  Sanity check for when PurgeQueue should
+        be idle.
+        """
+        mdc_stats = self.fs.mds_asok(['perf', 'dump', "mds_cache"])['mds_cache']
+        pq_stats = self.fs.mds_asok(['perf', 'dump', "purge_queue"])['purge_queue']
+        self.assertEqual(mdc_stats["num_strays"], 0)
+        self.assertEqual(mdc_stats["num_strays_delayed"], 0)
+        self.assertEqual(pq_stats["pq_executing"], 0)
+        self.assertEqual(pq_stats["pq_executing_ops"], 0)
+
+    def test_mv_cleanup(self):
+        """
+        That when doing a rename from A to B, and B has no hardlinks,
+        then we make a stray for B and purge him.
+        """
+        # Create file_a and file_b, write some to both
+        size_mb = 8
+        self.mount_a.write_n_mb("file_a", size_mb)
+        file_a_ino = self.mount_a.path_to_ino("file_a")
+        self.mount_a.write_n_mb("file_b", size_mb)
+        file_b_ino = self.mount_a.path_to_ino("file_b")
+
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assert_backtrace(file_a_ino, "file_a")
+        self.assert_backtrace(file_b_ino, "file_b")
+
+        # mv file_a file_b
+        self.mount_a.run_shell(['mv', 'file_a', 'file_b'])
+
+        # See that stray counter increments
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        # Wait for purge counter to increment
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        self.assert_purge_idle()
+
+        # file_b should have been purged
+        self.assertTrue(self.fs.data_objects_absent(file_b_ino, size_mb * 1024 * 1024))
+
+        # Backtrace should have updated from file_a to file_b
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assert_backtrace(file_a_ino, "file_b")
+
+        # file_a's data should still exist
+        self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
+
+    def _pool_df(self, pool_name):
+        """
+        Return a dict like
+            {
+                "kb_used": 0,
+                "bytes_used": 0,
+                "max_avail": 19630292406,
+                "objects": 0
+            }
+
+        :param pool_name: Which pool (must exist)
+        """
+        out = self.fs.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")
+        for p in json.loads(out)['pools']:
+            if p['name'] == pool_name:
+                return p['stats']
+
+        raise RuntimeError("Pool '{0}' not found".format(pool_name))
+
+    def await_data_pool_empty(self):
+        self.wait_until_true(
+            lambda: self._pool_df(
+                self.fs.get_data_pool_name()
+            )['objects'] == 0,
+            timeout=60)
+
+    def test_snapshot_remove(self):
+        """
+        That removal of a snapshot that references a now-unlinked file results
+        in purging on the stray for the file.
+        """
+        # Enable snapshots
+        self.fs.set_allow_new_snaps(True)
+
+        # Create a dir with a file in it
+        size_mb = 8
+        self.mount_a.run_shell(["mkdir", "snapdir"])
+        self.mount_a.run_shell(["mkdir", "snapdir/subdir"])
+        self.mount_a.write_test_pattern("snapdir/subdir/file_a", size_mb * 1024 * 1024)
+        file_a_ino = self.mount_a.path_to_ino("snapdir/subdir/file_a")
+
+        # Snapshot the dir
+        self.mount_a.run_shell(["mkdir", "snapdir/.snap/snap1"])
+
+        # Cause the head revision to deviate from the snapshot
+        self.mount_a.write_n_mb("snapdir/subdir/file_a", size_mb)
+
+        # Flush the journal so that backtraces, dirfrag objects will actually be written
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Unlink the file
+        self.mount_a.run_shell(["rm", "-f", "snapdir/subdir/file_a"])
+        self.mount_a.run_shell(["rmdir", "snapdir/subdir"])
+
+        # Unmount the client because when I come back to check the data is still
+        # in the file I don't want to just see what's in the page cache.
+        self.mount_a.umount_wait()
+
+        self.assertEqual(self.get_mdc_stat("strays_created"), 2)
+
+        # FIXME: at this stage we see a purge and the stray count drops to
+        # zero, but there's actually still a stray, so at the very
+        # least the StrayManager stats code is slightly off
+
+        self.mount_a.mount()
+
+        # See that the data from the snapshotted revision of the file is still present
+        # and correct
+        self.mount_a.validate_test_pattern("snapdir/.snap/snap1/subdir/file_a", size_mb * 1024 * 1024)
+
+        # Remove the snapshot
+        self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"])
+
+        # Purging file_a doesn't happen until after we've flushed the journal, because
+        # it is referenced by the snapshotted subdir, and the snapshot isn't really
+        # gone until the journal references to it are gone
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Wait for purging to complete, which requires the OSDMap to propagate to the OSDs.
+        # See also: http://tracker.ceph.com/issues/20072
+        self.wait_until_true(
+            lambda: self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024),
+            timeout=60
+        )
+
+        # See that a purge happens now
+        self._wait_for_counter("mds_cache", "strays_enqueued", 2)
+        self._wait_for_counter("purge_queue", "pq_executed", 2)
+
+        self.await_data_pool_empty()
+
+    def test_fancy_layout(self):
+        """
+        purge stray file with fancy layout
+        """
+
+        file_name = "fancy_layout_file"
+        self.mount_a.run_shell(["touch", file_name])
+
+        file_layout = "stripe_unit=1048576 stripe_count=4 object_size=8388608"
+        self.mount_a.setfattr(file_name, "ceph.file.layout", file_layout)
+
+        # 35MB requires 7 objects
+        size_mb = 35
+        self.mount_a.write_n_mb(file_name, size_mb)
+
+        self.mount_a.run_shell(["rm", "-f", file_name])
+        self.fs.mds_asok(["flush", "journal"])
+
+        # can't use self.fs.data_objects_absent here, it does not support fancy layout
+        self.await_data_pool_empty()
+
+    def test_dirfrag_limit(self):
+        """
+        That the directory fragment size cannot exceed mds_bal_fragment_size_max (using a limit of 50 in all configurations).
+
+        That fragmentation (forced) will allow more entries to be created.
+
+        That unlinking fails when the stray directory fragment becomes too large and that unlinking may continue once those strays are purged.
+        """
+
+        LOW_LIMIT = 50
+        for mds in self.fs.get_daemon_names():
+            self.fs.mds_asok(["config", "set", "mds_bal_fragment_size_max", str(LOW_LIMIT)], mds)
+
+        try:
+            self.mount_a.run_python(dedent("""
+                import os
+                path = os.path.join("{path}", "subdir")
+                os.mkdir(path)
+                for n in range(0, {file_count}):
+                    with open(os.path.join(path, "%s" % n), 'w') as f:
+                        f.write(str(n))
+                """.format(
+            path=self.mount_a.mountpoint,
+            file_count=LOW_LIMIT+1
+            )))
+        except CommandFailedError:
+            pass # ENOSPAC
+        else:
+            raise RuntimeError("fragment size exceeded")
+
+        # Now test that we can go beyond the limit if we fragment the directory
+
+        self.mount_a.run_python(dedent("""
+            import os
+            path = os.path.join("{path}", "subdir2")
+            os.mkdir(path)
+            for n in range(0, {file_count}):
+                with open(os.path.join(path, "%s" % n), 'w') as f:
+                    f.write(str(n))
+            dfd = os.open(path, os.O_DIRECTORY)
+            os.fsync(dfd)
+            """.format(
+        path=self.mount_a.mountpoint,
+        file_count=LOW_LIMIT
+        )))
+
+        # Ensure that subdir2 is fragmented
+        mds_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["dirfrag", "split", "/subdir2", "0/0", "1"], mds_id)
+
+        # remount+flush (release client caps)
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # Create 50% more files than the current fragment limit
+        self.mount_a.run_python(dedent("""
+            import os
+            path = os.path.join("{path}", "subdir2")
+            for n in range({file_count}, ({file_count}*3)//2):
+                with open(os.path.join(path, "%s" % n), 'w') as f:
+                    f.write(str(n))
+            """.format(
+        path=self.mount_a.mountpoint,
+        file_count=LOW_LIMIT
+        )))
+
+        # Now test the stray directory size is limited and recovers
+        strays_before = self.get_mdc_stat("strays_created")
+        try:
+            self.mount_a.run_python(dedent("""
+                import os
+                path = os.path.join("{path}", "subdir3")
+                os.mkdir(path)
+                for n in range({file_count}):
+                    fpath = os.path.join(path, "%s" % n)
+                    with open(fpath, 'w') as f:
+                        f.write(str(n))
+                    os.unlink(fpath)
+                """.format(
+            path=self.mount_a.mountpoint,
+            file_count=LOW_LIMIT*10 # 10 stray directories, should collide before this count
+            )))
+        except CommandFailedError:
+            pass # ENOSPAC
+        else:
+            raise RuntimeError("fragment size exceeded")
+
+        strays_after = self.get_mdc_stat("strays_created")
+        self.assertGreaterEqual(strays_after-strays_before, LOW_LIMIT)
+
+        self._wait_for_counter("mds_cache", "strays_enqueued", strays_after)
+        self._wait_for_counter("purge_queue", "pq_executed", strays_after)
+
+        self.mount_a.run_python(dedent("""
+            import os
+            path = os.path.join("{path}", "subdir4")
+            os.mkdir(path)
+            for n in range({file_count}):
+                fpath = os.path.join(path, "%s" % n)
+                with open(fpath, 'w') as f:
+                    f.write(str(n))
+                os.unlink(fpath)
+            """.format(
+        path=self.mount_a.mountpoint,
+        file_count=LOW_LIMIT
+        )))
+
+    def test_purge_queue_upgrade(self):
+        """
+        That when starting on a system with no purge queue in the metadata
+        pool, we silently create one.
+        :return:
+        """
+
+        self.mds_cluster.mds_stop()
+        self.mds_cluster.mds_fail()
+        self.fs.rados(["rm", "500.00000000"])
+        self.mds_cluster.mds_restart()
+        self.fs.wait_for_daemons()
+
+    def test_replicated_delete_speed(self):
+        """
+        That deletions of replicated metadata are not pathologically slow
+        """
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
+        self.mds_cluster.mds_fail_restart(rank_1_id)
+        self.fs.wait_for_daemons()
+
+        file_count = 10
+
+        self.mount_a.create_n_files("delete_me/file", file_count)
+
+        self._force_migrate(rank_1_id, "delete_me",
+                            self.mount_a.path_to_ino("delete_me/file_0"))
+
+        begin = datetime.datetime.now()
+        self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")])
+        end = datetime.datetime.now()
+
+        # What we're really checking here is that we are completing client
+        # operations immediately rather than delaying until the next tick.
+        tick_period = float(self.fs.get_config("mds_tick_interval",
+                                               service_type="mds"))
+
+        duration = (end - begin).total_seconds()
+        self.assertLess(duration, (file_count * tick_period) * 0.25)
+
diff --git a/qa/tasks/cephfs/test_volume_client.py b/qa/tasks/cephfs/test_volume_client.py
new file mode 100644
index 00000000..7f66218c
--- /dev/null
+++ b/qa/tasks/cephfs/test_volume_client.py
@@ -0,0 +1,1765 @@
+import json
+import logging
+import os
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+from teuthology.misc import sudo_write_file
+
+log = logging.getLogger(__name__)
+
+
+class TestVolumeClient(CephFSTestCase):
+    # One for looking at the global filesystem, one for being
+    # the VolumeClient, two for mounting the created shares
+    CLIENTS_REQUIRED = 4
+
+    def setUp(self):
+        CephFSTestCase.setUp(self)
+
+    def _volume_client_python(self, client, script, vol_prefix=None, ns_prefix=None):
+        # Can't dedent this *and* the script we pass in, because they might have different
+        # levels of indentation to begin with, so leave this string zero-indented
+        if vol_prefix:
+            vol_prefix = "\"" + vol_prefix + "\""
+        if ns_prefix:
+            ns_prefix = "\"" + ns_prefix + "\""
+        return client.run_python("""
+from __future__ import print_function
+from ceph_volume_client import CephFSVolumeClient, VolumePath
+from sys import version_info as sys_version_info
+from rados import OSError as rados_OSError
+import logging
+log = logging.getLogger("ceph_volume_client")
+log.addHandler(logging.StreamHandler())
+log.setLevel(logging.DEBUG)
+vc = CephFSVolumeClient("manila", "{conf_path}", "ceph", {vol_prefix}, {ns_prefix})
+vc.connect()
+{payload}
+vc.disconnect()
+        """.format(payload=script, conf_path=client.config_path,
+                   vol_prefix=vol_prefix, ns_prefix=ns_prefix))
+
+    def _configure_vc_auth(self, mount, id_name):
+        """
+        Set up auth credentials for the VolumeClient user
+        """
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.{name}".format(name=id_name),
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+        mount.client_id = id_name
+        sudo_write_file(mount.client_remote, mount.get_keyring_path(), out)
+        self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
+
+    def _configure_guest_auth(self, volumeclient_mount, guest_mount,
+                              guest_entity, mount_path,
+                              namespace_prefix=None, readonly=False,
+                              tenant_id=None, allow_existing_id=False):
+        """
+        Set up auth credentials for the guest client to mount a volume.
+
+        :param volumeclient_mount: mount used as the handle for driving
+                                   volumeclient.
+        :param guest_mount: mount used by the guest client.
+        :param guest_entity: auth ID used by the guest client.
+        :param mount_path: path of the volume.
+        :param namespace_prefix: name prefix of the RADOS namespace, which
+                                 is used for the volume's layout.
+        :param readonly: defaults to False. If set to 'True' only read-only
+                         mount access is granted to the guest.
+        :param tenant_id: (OpenStack) tenant ID of the guest client.
+        """
+
+        head, volume_id = os.path.split(mount_path)
+        head, group_id = os.path.split(head)
+        head, volume_prefix = os.path.split(head)
+        volume_prefix = "/" + volume_prefix
+
+        # Authorize the guest client's auth ID to mount the volume.
+        key = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_result = vc.authorize(vp, "{guest_entity}", readonly={readonly},
+                                       tenant_id="{tenant_id}",
+                                       allow_existing_id="{allow_existing_id}")
+            print(auth_result['auth_key'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity,
+            readonly=readonly,
+            tenant_id=tenant_id,
+            allow_existing_id=allow_existing_id)), volume_prefix, namespace_prefix
+        )
+
+        # CephFSVolumeClient's authorize() does not return the secret
+        # key to a caller who isn't multi-tenant aware. Explicitly
+        # query the key for such a client.
+        if not tenant_id:
+            key = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-key", "client.{name}".format(name=guest_entity),
+            )
+
+        # The guest auth ID should exist.
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertIn("client.{0}".format(guest_entity), existing_ids)
+
+        # Create keyring file for the guest client.
+        keyring_txt = dedent("""
+        [client.{guest_entity}]
+            key = {key}
+
+        """.format(
+            guest_entity=guest_entity,
+            key=key
+        ))
+        guest_mount.client_id = guest_entity
+        sudo_write_file(guest_mount.client_remote,
+                        guest_mount.get_keyring_path(), keyring_txt)
+
+        # Add a guest client section to the ceph config file.
+        self.set_conf("client.{0}".format(guest_entity), "client quota", "True")
+        self.set_conf("client.{0}".format(guest_entity), "debug client", "20")
+        self.set_conf("client.{0}".format(guest_entity), "debug objecter", "20")
+        self.set_conf("client.{0}".format(guest_entity),
+                      "keyring", guest_mount.get_keyring_path())
+
+    def test_default_prefix(self):
+        group_id = "grpid"
+        volume_id = "volid"
+        DEFAULT_VOL_PREFIX = "volumes"
+        DEFAULT_NS_PREFIX = "fsvolumens_"
+
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        #create a volume with default prefix
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 10, data_isolated=True)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # The dir should be created
+        self.mount_a.stat(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id))
+
+        #namespace should be set
+        ns_in_attr = self.mount_a.getfattr(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id), "ceph.dir.layout.pool_namespace")
+        namespace = "{0}{1}".format(DEFAULT_NS_PREFIX, volume_id)
+        self.assertEqual(namespace, ns_in_attr)
+
+
+    def test_lifecycle(self):
+        """
+        General smoke test for create, extend, destroy
+        """
+
+        # I'm going to use mount_c later as a guest for mounting the created
+        # shares
+        self.mounts[2].umount_wait()
+
+        # I'm going to leave mount_b unmounted and just use it as a handle for
+        # driving volumeclient.  It's a little hacky but we don't have a more
+        # general concept for librados/libcephfs clients as opposed to full
+        # blown mounting clients.
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+
+        volume_prefix = "/myprefix"
+        namespace_prefix = "mynsprefix_"
+
+        # Create a 100MB volume
+        volume_size = 100
+        mount_path = self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*{volume_size})
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            volume_size=volume_size
+        )), volume_prefix, namespace_prefix)
+
+        # The dir should be created
+        self.mount_a.stat(os.path.join("myprefix", group_id, volume_id))
+
+        # Authorize and configure credentials for the guest to mount the
+        # the volume.
+        self._configure_guest_auth(self.mount_b, self.mounts[2], guest_entity,
+                                   mount_path, namespace_prefix)
+        self.mounts[2].mount(mount_path=mount_path)
+
+        # The kernel client doesn't have the quota-based df behaviour,
+        # or quotas at all, so only exercise the client behaviour when
+        # running fuse.
+        if isinstance(self.mounts[2], FuseMount):
+            # df should see volume size, same as the quota set on volume's dir
+            self.assertEqual(self.mounts[2].df()['total'],
+                             volume_size * 1024 * 1024)
+            self.assertEqual(
+                    self.mount_a.getfattr(
+                        os.path.join(volume_prefix.strip("/"), group_id, volume_id),
+                        "ceph.quota.max_bytes"),
+                    "%s" % (volume_size * 1024 * 1024))
+
+            # df granularity is 4MB block so have to write at least that much
+            data_bin_mb = 4
+            self.mounts[2].write_n_mb("data.bin", data_bin_mb)
+
+            # Write something outside volume to check this space usage is
+            # not reported in the volume's DF.
+            other_bin_mb = 8
+            self.mount_a.write_n_mb("other.bin", other_bin_mb)
+
+            # global: df should see all the writes (data + other).  This is a >
+            # rather than a == because the global spaced used includes all pools
+            def check_df():
+                used = self.mount_a.df()['used']
+                return used >= (other_bin_mb * 1024 * 1024)
+
+            self.wait_until_true(check_df, timeout=30)
+
+            # Hack: do a metadata IO to kick rstats
+            self.mounts[2].run_shell(["touch", "foo"])
+
+            # volume: df should see the data_bin_mb consumed from quota, same
+            # as the rbytes for the volume's dir
+            self.wait_until_equal(
+                    lambda: self.mounts[2].df()['used'],
+                    data_bin_mb * 1024 * 1024, timeout=60)
+            self.wait_until_equal(
+                    lambda: self.mount_a.getfattr(
+                        os.path.join(volume_prefix.strip("/"), group_id, volume_id),
+                        "ceph.dir.rbytes"),
+                    "%s" % (data_bin_mb * 1024 * 1024), timeout=60)
+
+            # sync so that file data are persist to rados
+            self.mounts[2].run_shell(["sync"])
+
+            # Our data should stay in particular rados namespace
+            pool_name = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool")
+            namespace = "{0}{1}".format(namespace_prefix, volume_id)
+            ns_in_attr = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool_namespace")
+            self.assertEqual(namespace, ns_in_attr)
+
+            objects_in_ns = set(self.fs.rados(["ls"], pool=pool_name, namespace=namespace).split("\n"))
+            self.assertNotEqual(objects_in_ns, set())
+
+            # De-authorize the guest
+            self._volume_client_python(self.mount_b, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.deauthorize(vp, "{guest_entity}")
+                vc.evict("{guest_entity}")
+            """.format(
+                group_id=group_id,
+                volume_id=volume_id,
+                guest_entity=guest_entity
+            )), volume_prefix, namespace_prefix)
+
+            # Once deauthorized, the client should be unable to do any more metadata ops
+            # The way that the client currently behaves here is to block (it acts like
+            # it has lost network, because there is nothing to tell it that is messages
+            # are being dropped because it's identity is gone)
+            background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False)
+            try:
+                background.wait()
+            except CommandFailedError:
+                # command failed with EBLACKLISTED?
+                if "transport endpoint shutdown" in background.stderr.getvalue():
+                    pass
+                else:
+                    raise
+
+            # After deauthorisation, the client ID should be gone (this was the only
+            # volume it was authorised for)
+            self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()])
+
+            # Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined)
+            self.mounts[2].umount_wait()
+
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )), volume_prefix, namespace_prefix)
+
+    def test_idempotency(self):
+        """
+        That the volumeclient interface works when calling everything twice
+        """
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 10)
+            vc.create_volume(vp, 10)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.delete_volume(vp)
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+            vc.purge_volume(vp)
+
+            vc.create_volume(vp, 10, data_isolated=True)
+            vc.create_volume(vp, 10, data_isolated=True)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.delete_volume(vp, data_isolated=True)
+            vc.delete_volume(vp, data_isolated=True)
+            vc.purge_volume(vp, data_isolated=True)
+            vc.purge_volume(vp, data_isolated=True)
+
+            vc.create_volume(vp, 10, namespace_isolated=False)
+            vc.create_volume(vp, 10, namespace_isolated=False)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.delete_volume(vp)
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity
+        )))
+
+    def test_data_isolated(self):
+        """
+        That data isolated shares get their own pool
+        :return:
+        """
+
+        # Because the teuthology config template sets mon_max_pg_per_osd to
+        # 10000 (i.e. it just tries to ignore health warnings), reset it to something
+        # sane before using volume_client, to avoid creating pools with absurdly large
+        # numbers of PGs.
+        self.set_conf("global", "mon max pg per osd", "300")
+        for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'):
+            mon_daemon_state.restart()
+
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        # Calculate how many PGs we'll expect the new volume pool to have
+        osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
+        max_per_osd = int(self.fs.get_config('mon_max_pg_per_osd'))
+        osd_count = len(osd_map['osds'])
+        max_overall = osd_count * max_per_osd
+
+        existing_pg_count = 0
+        for p in osd_map['pools']:
+            existing_pg_count += p['pg_num']
+
+        expected_pg_num = (max_overall - existing_pg_count) // 10
+        log.info("max_per_osd {0}".format(max_per_osd))
+        log.info("osd_count {0}".format(osd_count))
+        log.info("max_overall {0}".format(max_overall))
+        log.info("existing_pg_count {0}".format(existing_pg_count))
+        log.info("expected_pg_num {0}".format(expected_pg_num))
+
+        pools_a = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+
+        group_id = "grpid"
+        volume_id = "volid"
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 10, data_isolated=True)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        pools_b = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+
+        # Should have created one new pool
+        new_pools = set(p['pool_name'] for p in pools_b) - set([p['pool_name'] for p in pools_a])
+        self.assertEqual(len(new_pools), 1)
+
+        # It should have followed the heuristic for PG count
+        # (this is an overly strict test condition, so we may want to remove
+        #  it at some point as/when the logic gets fancier)
+        created_pg_num = self.fs.mon_manager.get_pool_property(list(new_pools)[0], "pg_num")
+        self.assertEqual(expected_pg_num, created_pg_num)
+
+    def test_15303(self):
+        """
+        Reproducer for #15303 "Client holds incorrect complete flag on dir
+        after losing caps" (http://tracker.ceph.com/issues/15303)
+        """
+        for m in self.mounts:
+            m.umount_wait()
+
+        # Create a dir on mount A
+        self.mount_a.mount()
+        self.mount_a.run_shell(["mkdir", "parent1"])
+        self.mount_a.run_shell(["mkdir", "parent2"])
+        self.mount_a.run_shell(["mkdir", "parent1/mydir"])
+
+        # Put some files in it from mount B
+        self.mount_b.mount()
+        self.mount_b.run_shell(["touch", "parent1/mydir/afile"])
+        self.mount_b.umount_wait()
+
+        # List the dir's contents on mount A
+        self.assertListEqual(self.mount_a.ls("parent1/mydir"),
+                             ["afile"])
+
+    def test_evict_client(self):
+        """
+        That a volume client can be evicted based on its auth ID and the volume
+        path it has mounted.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Requires FUSE client to inject client metadata")
+
+        # mounts[1] would be used as handle for driving VolumeClient. mounts[2]
+        # and mounts[3] would be used as guests to mount the volumes/shares.
+
+        for i in range(1, 4):
+            self.mounts[i].umount_wait()
+
+        volumeclient_mount = self.mounts[1]
+        self._configure_vc_auth(volumeclient_mount, "manila")
+        guest_mounts = (self.mounts[2], self.mounts[3])
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        mount_paths = []
+        volume_ids = []
+
+        # Create two volumes. Authorize 'guest' auth ID to mount the two
+        # volumes. Mount the two volumes. Write data to the volumes.
+        for i in range(2):
+            # Create volume.
+            volume_ids.append("volid_{0}".format(str(i)))
+            mount_paths.append(
+                self._volume_client_python(volumeclient_mount, dedent("""
+                    vp = VolumePath("{group_id}", "{volume_id}")
+                    create_result = vc.create_volume(vp, 10 * 1024 * 1024)
+                    print(create_result['mount_path'])
+                """.format(
+                    group_id=group_id,
+                    volume_id=volume_ids[i]
+            ))))
+
+            # Authorize 'guest' auth ID to mount the volume.
+            self._configure_guest_auth(volumeclient_mount, guest_mounts[i],
+                                       guest_entity, mount_paths[i])
+
+            # Mount the volume.
+            guest_mounts[i].mountpoint_dir_name = 'mnt.{id}.{suffix}'.format(
+                id=guest_entity, suffix=str(i))
+            guest_mounts[i].mount(mount_path=mount_paths[i])
+            guest_mounts[i].write_n_mb("data.bin", 1)
+
+
+        # Evict client, guest_mounts[0], using auth ID 'guest' and has mounted
+        # one volume.
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}", volume_path=vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_ids[0],
+            guest_entity=guest_entity
+        )))
+
+        # Evicted guest client, guest_mounts[0], should not be able to do
+        # anymore metadata ops.  It should start failing all operations
+        # when it sees that its own address is in the blacklist.
+        try:
+            guest_mounts[0].write_n_mb("rogue.bin", 1)
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError("post-eviction write should have failed!")
+
+        # The blacklisted guest client should now be unmountable
+        guest_mounts[0].umount_wait()
+
+        # Guest client, guest_mounts[1], using the same auth ID 'guest', but
+        # has mounted the other volume, should be able to use its volume
+        # unaffected.
+        guest_mounts[1].write_n_mb("data.bin.1", 1)
+
+        # Cleanup.
+        for i in range(2):
+            self._volume_client_python(volumeclient_mount, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.deauthorize(vp, "{guest_entity}")
+                vc.delete_volume(vp)
+                vc.purge_volume(vp)
+            """.format(
+                group_id=group_id,
+                volume_id=volume_ids[i],
+                guest_entity=guest_entity
+            )))
+
+
+    def test_purge(self):
+        """
+        Reproducer for #15266, exception trying to purge volumes that
+        contain non-ascii filenames.
+
+        Additionally test any other purge corner cases here.
+        """
+        # I'm going to leave mount_b unmounted and just use it as a handle for
+        # driving volumeclient.  It's a little hacky but we don't have a more
+        # general concept for librados/libcephfs clients as opposed to full
+        # blown mounting clients.
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        group_id = "grpid"
+        # Use a unicode volume ID (like Manila), to reproduce #15266
+        volume_id = u"volid"
+
+        # Create
+        mount_path = self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", u"{volume_id}")
+            create_result = vc.create_volume(vp, 10)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Strip leading "/"
+        mount_path = mount_path[1:]
+
+        # A file with non-ascii characters
+        self.mount_a.run_shell(["touch", os.path.join(mount_path, u"b\u00F6b")])
+
+        # A file with no permissions to do anything
+        self.mount_a.run_shell(["touch", os.path.join(mount_path, "noperms")])
+        self.mount_a.run_shell(["chmod", "0000", os.path.join(mount_path, "noperms")])
+
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", u"{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Check it's really gone
+        self.assertEqual(self.mount_a.ls("volumes/_deleting"), [])
+        self.assertEqual(self.mount_a.ls("volumes/"), ["_deleting", group_id])
+
+    def test_readonly_authorization(self):
+        """
+        That guest clients can be restricted to read-only mounts of volumes.
+        """
+
+        volumeclient_mount = self.mounts[1]
+        guest_mount = self.mounts[2]
+        volumeclient_mount.umount_wait()
+        guest_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+
+        # Create a volume.
+        mount_path = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Authorize and configure credentials for the guest to mount the
+        # the volume with read-write access.
+        self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
+                                   mount_path, readonly=False)
+
+        # Mount the volume, and write to it.
+        guest_mount.mount(mount_path=mount_path)
+        guest_mount.write_n_mb("data.bin", 1)
+
+        # Change the guest auth ID's authorization to read-only mount access.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity
+        )))
+        self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
+                                   mount_path, readonly=True)
+
+        # The effect of the change in access level to read-only is not
+        # immediate. The guest sees the change only after a remount of
+        # the volume.
+        guest_mount.umount_wait()
+        guest_mount.mount(mount_path=mount_path)
+
+        # Read existing content of the volume.
+        self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
+        # Cannot write into read-only volume.
+        try:
+            guest_mount.write_n_mb("rogue.bin", 1)
+        except CommandFailedError:
+            pass
+
+    def test_get_authorized_ids(self):
+        """
+        That for a volume, the authorized IDs and their access levels
+        can be obtained using CephFSVolumeClient's get_authorized_ids().
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "grpid"
+        volume_id = "volid"
+        guest_entity_1 = "guest1"
+        guest_entity_2 = "guest2"
+
+        log.info("print(group ID: {0})".format(group_id))
+
+        # Create a volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+            auths = vc.get_authorized_ids(vp)
+            print(auths)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        # Check the list of authorized IDs for the volume.
+        self.assertEqual('None', auths)
+
+        # Allow two auth IDs access to the volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{guest_entity_1}", readonly=False)
+            vc.authorize(vp, "{guest_entity_2}", readonly=True)
+            auths = vc.get_authorized_ids(vp)
+            print(auths)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity_1=guest_entity_1,
+            guest_entity_2=guest_entity_2,
+        )))
+        # Check the list of authorized IDs and their access levels.
+        expected_result = [('guest1', 'rw'), ('guest2', 'r')]
+        self.assertCountEqual(str(expected_result), auths)
+
+        # Disallow both the auth IDs' access to the volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity_1}")
+            vc.deauthorize(vp, "{guest_entity_2}")
+            auths = vc.get_authorized_ids(vp)
+            print(auths)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity_1=guest_entity_1,
+            guest_entity_2=guest_entity_2,
+        )))
+        # Check the list of authorized IDs for the volume.
+        self.assertEqual('None', auths)
+
+    def test_multitenant_volumes(self):
+        """
+        That volume access can be restricted to a tenant.
+
+        That metadata used to enforce tenant isolation of
+        volumes is stored as a two-way mapping between auth
+        IDs and volumes that they're authorized to access.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        # Guest clients belonging to different tenants, but using the same
+        # auth ID.
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+        guestclient_2 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant2",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest', is
+        # created on authorizing 'guest' access to the volume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 2,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "groupid/volumeid": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print(json.dumps(auth_metadata))
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+        )))
+        auth_metadata = json.loads(auth_metadata)
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Verify that the volume metadata file stores info about auth IDs
+        # and their access levels to the volume, versioning details, etc.
+        expected_vol_metadata = {
+            "version": 2,
+            "compat_version": 1,
+            "auths": {
+                "guest": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        vol_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            vp = VolumePath("{group_id}", "{volume_id}")
+            volume_metadata = vc._volume_metadata_get(vp)
+            print(json.dumps(volume_metadata))
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        vol_metadata = json.loads(vol_metadata)
+
+        self.assertGreaterEqual(vol_metadata["version"], expected_vol_metadata["version"])
+        del expected_vol_metadata["version"]
+        del vol_metadata["version"]
+        self.assertEqual(expected_vol_metadata, vol_metadata)
+
+        # Cannot authorize 'guestclient_2' to access the volume.
+        # It uses auth ID 'guest', which has already been used by a
+        # 'guestclient_1' belonging to an another tenant for accessing
+        # the volume.
+        with self.assertRaises(CommandFailedError):
+            self._volume_client_python(volumeclient_mount, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+            """.format(
+                group_id=group_id,
+                volume_id=volume_id,
+                auth_id=guestclient_2["auth_id"],
+                tenant_id=guestclient_2["tenant_id"]
+            )))
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's only access to a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+    def test_authorize_auth_id_not_created_by_ceph_volume_client(self):
+        """
+        If the auth_id already exists and is not created by
+        ceph_volume_client, it's not allowed to authorize
+        the auth-id by default.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        # Create auth_id
+        self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.guest1",
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Cannot authorize 'guestclient_1' to access the volume.
+        # It uses auth ID 'guest1', which already exists and not
+        # created by ceph_volume_client
+        with self.assertRaises(CommandFailedError):
+            self._volume_client_python(volumeclient_mount, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+            """.format(
+                group_id=group_id,
+                volume_id=volume_id,
+                auth_id=guestclient_1["auth_id"],
+                tenant_id=guestclient_1["tenant_id"]
+            )))
+
+        # Delete volume
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+    def test_authorize_allow_existing_id_option(self):
+        """
+        If the auth_id already exists and is not created by
+        ceph_volume_client, it's not allowed to authorize
+        the auth-id by default but is allowed with option
+        allow_existing_id.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        # Create auth_id
+        self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.guest1",
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Cannot authorize 'guestclient_1' to access the volume
+        # by default, which already exists and not created by
+        # ceph_volume_client but is allowed with option 'allow_existing_id'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}",
+                         allow_existing_id="{allow_existing_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"],
+            allow_existing_id=True
+        )))
+
+        # Delete volume
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+    def test_deauthorize_auth_id_after_out_of_band_update(self):
+        """
+        If the auth_id authorized by ceph_volume_client is updated
+        out of band, the auth_id should not be deleted after a
+        deauthorize. It should only remove caps associated it.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Authorize 'guestclient_1' to access the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Update caps for guestclient_1 out of band
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "caps", "client.guest1",
+            "mds", "allow rw path=/volumes/groupid, allow rw path=/volumes/groupid/volumeid",
+            "osd", "allow rw pool=cephfs_data namespace=fsvolumens_volumeid",
+            "mon", "allow r",
+            "mgr", "allow *"
+        )
+
+        # Deauthorize guestclient_1
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+
+        # Validate the caps of guestclient_1 after deauthorize. It should not have deleted
+        # guestclient_1. The mgr and mds caps should be present which was updated out of band.
+        out = json.loads(self.fs.mon_manager.raw_cluster_cmd("auth", "get", "client.guest1", "--format=json-pretty"))
+
+        self.assertEqual("client.guest1", out[0]["entity"])
+        self.assertEqual("allow rw path=/volumes/groupid", out[0]["caps"]["mds"])
+        self.assertEqual("allow *", out[0]["caps"]["mgr"])
+        self.assertNotIn("osd", out[0]["caps"])
+
+        # Delete volume
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+    def test_recover_metadata(self):
+        """
+        That volume client can recover from partial auth updates using
+        metadata files, which store auth info and its update status info.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        guestclient = {
+            "auth_id": "guest",
+            "tenant_id": "tenant",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Authorize 'guestclient' access to the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+            tenant_id=guestclient["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest' is created.
+        auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run recovery procedure.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            auth_metadata['dirty'] = True
+            vc._auth_metadata_set("{auth_id}", auth_metadata)
+            vc.recover()
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+        )))
+
+    def test_update_old_style_auth_metadata_to_new_during_recover(self):
+        """
+        From nautilus onwards 'volumes' created by ceph_volume_client were
+        renamed and used as CephFS subvolumes accessed via the ceph-mgr
+        interface. Hence it makes sense to store the subvolume data in
+        auth-metadata file with 'subvolumes' key instead of 'volumes' key.
+        This test validates the transparent update of 'volumes' key to
+        'subvolumes' key in auth metadata file during recover.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        guestclient = {
+            "auth_id": "guest",
+            "tenant_id": "tenant",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient' access to the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+            tenant_id=guestclient["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest' is created.
+        auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        self.mounts[0].run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 2,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant",
+            "subvolumes": {
+                "groupid/volumeid": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run recovery procedure. This should also update 'volumes' key
+        # to 'subvolumes'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            auth_metadata['dirty'] = True
+            vc._auth_metadata_set("{auth_id}", auth_metadata)
+            vc.recover()
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+        )))
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print(json.dumps(auth_metadata))
+        """.format(
+            auth_id=guestclient["auth_id"],
+        )))
+        auth_metadata = json.loads(auth_metadata)
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's access to volumes 'volumeid'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guestclient["auth_id"]
+        )))
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+    def test_update_old_style_auth_metadata_to_new_during_authorize(self):
+        """
+        From nautilus onwards 'volumes' created by ceph_volume_client were
+        renamed and used as CephFS subvolumes accessed via the ceph-mgr
+        interface. Hence it makes sense to store the subvolume data in
+        auth-metadata file with 'subvolumes' key instead of 'volumes' key.
+        This test validates the transparent update of 'volumes' key to
+        'subvolumes' key in auth metadata file during authorize.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id1 = "volumeid1"
+        volume_id2 = "volumeid2"
+
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume volumeid1.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 10*1024*1024)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+        )))
+
+        # Create a volume volumeid2.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 10*1024*1024)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id1)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+        vol_metadata_filename2 = "_{0}:{1}.meta".format(group_id, volume_id2)
+        self.assertIn(vol_metadata_filename2, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume 'volumeid1'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest', is
+        # created on authorizing 'guest' access to the volume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        self.mounts[0].run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume 'volumeid2'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 2,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "groupid/volumeid1": {
+                    "dirty": False,
+                    "access_level": "rw"
+                },
+                "groupid/volumeid2": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print(json.dumps(auth_metadata))
+        """.format(
+            auth_id=guestclient_1["auth_id"],
+        )))
+        auth_metadata = json.loads(auth_metadata)
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's access to volumes 'volumeid1' and 'volumeid2'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+        )))
+        self.assertNotIn(vol_metadata_filename2, self.mounts[0].ls("volumes"))
+
+    def test_update_old_style_auth_metadata_to_new_during_deauthorize(self):
+        """
+        From nautilus onwards 'volumes' created by ceph_volume_client were
+        renamed and used as CephFS subvolumes accessed via the ceph-mgr
+        interface. Hence it makes sense to store the subvolume data in
+        auth-metadata file with 'subvolumes' key instead of 'volumes' key.
+        This test validates the transparent update of 'volumes' key to
+        'subvolumes' key in auth metadata file during de-authorize.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id1 = "volumeid1"
+        volume_id2 = "volumeid2"
+
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume volumeid1.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 10*1024*1024)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+        )))
+
+        # Create a volume volumeid2.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 10*1024*1024)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id1)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+        vol_metadata_filename2 = "_{0}:{1}.meta".format(group_id, volume_id2)
+        self.assertIn(vol_metadata_filename2, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume 'volumeid1'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume 'volumeid2'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest', is
+        # created on authorizing 'guest' access to the volume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        self.mounts[0].run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Deauthorize 'guestclient_1' to access 'volumeid2'. This should update
+        # 'volumes' key to 'subvolumes'
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+            guest_entity=guestclient_1["auth_id"],
+        )))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 2,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "groupid/volumeid1": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print(json.dumps(auth_metadata))
+        """.format(
+            auth_id=guestclient_1["auth_id"],
+        )))
+        auth_metadata = json.loads(auth_metadata)
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's access to volumes 'volumeid1' and 'volumeid2'
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on 'volumeid1' deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on 'volumeid2' deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+        )))
+        self.assertNotIn(vol_metadata_filename2, self.mounts[0].ls("volumes"))
+
+    def test_put_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test data'
+        obj_name = 'test_vc_obj_1'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self._volume_client_python(vc_mount, dedent("""
+            vc.put_object("{pool_name}", "{obj_name}", b"{obj_data}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+            obj_data = obj_data
+        )))
+
+        read_data = self.fs.rados(['get', obj_name, '-'], pool=pool_name)
+        self.assertEqual(obj_data, read_data)
+
+    def test_get_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test_data'
+        obj_name = 'test_vc_ob_2'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.get_object("{pool_name}", "{obj_name}")
+            assert data_read == b"{obj_data}"
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+            obj_data = obj_data
+        )))
+
+    def test_put_object_versioned(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test_data'
+        obj_name = 'test_vc_obj'
+        pool_name = self.fs.get_data_pool_names()[0]
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+        self._volume_client_python(vc_mount, dedent("""
+            data, version_before = vc.get_object_and_version("{pool_name}", "{obj_name}")
+
+            if sys_version_info.major < 3:
+                data = data + 'modification1'
+            elif sys_version_info.major > 3:
+                data = str.encode(data.decode() + 'modification1')
+
+            vc.put_object_versioned("{pool_name}", "{obj_name}", data, version_before)
+            data, version_after = vc.get_object_and_version("{pool_name}", "{obj_name}")
+            assert version_after == version_before + 1
+        """).format(pool_name=pool_name, obj_name=obj_name))
+
+    def test_version_check_for_put_object_versioned(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test_data'
+        obj_name = 'test_vc_ob_2'
+        pool_name = self.fs.get_data_pool_names()[0]
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+        # Test if put_object_versioned() crosschecks the version of the
+        # given object. Being a negative test, an exception is expected.
+        expected_exception = 'rados_OSError'
+        output = self._volume_client_python(vc_mount, dedent("""
+            data, version = vc.get_object_and_version("{pool_name}", "{obj_name}")
+
+            if sys_version_info.major < 3:
+                data = data + 'm1'
+            elif sys_version_info.major > 3:
+                data = str.encode(data.decode('utf-8') + 'm1')
+
+            vc.put_object("{pool_name}", "{obj_name}", data)
+
+            if sys_version_info.major < 3:
+                data = data + 'm2'
+            elif sys_version_info.major > 3:
+                data = str.encode(data.decode('utf-8') + 'm2')
+
+            try:
+                vc.put_object_versioned("{pool_name}", "{obj_name}", data, version)
+            except {expected_exception}:
+                print('{expected_exception} raised')
+        """).format(pool_name=pool_name, obj_name=obj_name,
+                    expected_exception=expected_exception))
+        self.assertEqual(expected_exception + ' raised', output)
+
+
+    def test_delete_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test data'
+        obj_name = 'test_vc_obj_3'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.delete_object("{pool_name}", "{obj_name}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+        )))
+
+        with self.assertRaises(CommandFailedError):
+            self.fs.rados(['stat', obj_name], pool=pool_name)
+
+        # Check idempotency -- no error raised trying to delete non-existent
+        # object
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.delete_object("{pool_name}", "{obj_name}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+        )))
+
+    def test_21501(self):
+        """
+        Reproducer for #21501 "ceph_volume_client: sets invalid caps for
+        existing IDs with no caps" (http://tracker.ceph.com/issues/21501)
+        """
+
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+
+        # Configure vc_mount as the handle for driving volumeclient
+        self._configure_vc_auth(vc_mount, "manila")
+
+        # Create a volume
+        group_id = "grpid"
+        volume_id = "volid"
+        mount_path = self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Create an auth ID with no caps
+        guest_id = '21501'
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'get-or-create', 'client.{0}'.format(guest_id))
+
+        guest_mount = self.mounts[2]
+        guest_mount.umount_wait()
+
+        # Set auth caps for the auth ID using the volumeclient
+        self._configure_guest_auth(vc_mount, guest_mount, guest_id, mount_path, allow_existing_id=True)
+
+        # Mount the volume in the guest using the auth ID to assert that the
+        # auth caps are valid
+        guest_mount.mount(mount_path=mount_path)
+
+    def test_volume_without_namespace_isolation(self):
+        """
+        That volume client can create volumes that do not have separate RADOS
+        namespace layouts.
+        """
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+
+        # Configure vc_mount as the handle for driving volumeclient
+        self._configure_vc_auth(vc_mount, "manila")
+
+        # Create a volume
+        volume_prefix = "/myprefix"
+        group_id = "grpid"
+        volume_id = "volid"
+        self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10, namespace_isolated=False)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )), volume_prefix)
+
+        # The CephFS volume should be created
+        self.mounts[0].stat(os.path.join("myprefix", group_id, volume_id))
+        vol_namespace = self.mounts[0].getfattr(
+            os.path.join("myprefix", group_id, volume_id),
+            "ceph.dir.layout.pool_namespace")
+        assert not vol_namespace
+
+        self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )), volume_prefix)
diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py
new file mode 100644
index 00000000..67f138f8
--- /dev/null
+++ b/qa/tasks/cephfs/test_volumes.py
@@ -0,0 +1,4435 @@
+import os
+import json
+import time
+import errno
+import random
+import logging
+import collections
+import uuid
+import unittest
+from hashlib import md5
+from textwrap import dedent
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+from teuthology.misc import sudo_write_file
+
+log = logging.getLogger(__name__)
+
+class TestVolumes(CephFSTestCase):
+    TEST_VOLUME_PREFIX = "volume"
+    TEST_SUBVOLUME_PREFIX="subvolume"
+    TEST_GROUP_PREFIX="group"
+    TEST_SNAPSHOT_PREFIX="snapshot"
+    TEST_CLONE_PREFIX="clone"
+    TEST_FILE_NAME_PREFIX="subvolume_file"
+
+    # for filling subvolume with data
+    CLIENTS_REQUIRED = 2
+
+    # io defaults
+    DEFAULT_FILE_SIZE = 1 # MB
+    DEFAULT_NUMBER_OF_FILES = 1024
+
+    def _fs_cmd(self, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", *args)
+
+    def _raw_cmd(self, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd(*args)
+
+    def __check_clone_state(self, state, clone, clone_group=None, timo=120):
+        check = 0
+        args = ["clone", "status", self.volname, clone]
+        if clone_group:
+            args.append(clone_group)
+        args = tuple(args)
+        while check < timo:
+            result = json.loads(self._fs_cmd(*args))
+            if result["status"]["state"] == state:
+                break
+            check += 1
+            time.sleep(1)
+        self.assertTrue(check < timo)
+
+    def _wait_for_clone_to_complete(self, clone, clone_group=None, timo=120):
+        self.__check_clone_state("complete", clone, clone_group, timo)
+
+    def _wait_for_clone_to_fail(self, clone, clone_group=None, timo=120):
+        self.__check_clone_state("failed", clone, clone_group, timo)
+
+    def _check_clone_canceled(self, clone, clone_group=None):
+        self.__check_clone_state("canceled", clone, clone_group, timo=1)
+
+    def _get_subvolume_snapshot_path(self, subvolume, snapshot, source_group, subvol_path, source_version):
+        if source_version == 2:
+            # v2
+            if subvol_path is not None:
+                (base_path, uuid_str) = os.path.split(subvol_path)
+            else:
+                (base_path, uuid_str) = os.path.split(self._get_subvolume_path(self.volname, subvolume, group_name=source_group))
+            return os.path.join(base_path, ".snap", snapshot, uuid_str)
+
+        # v1
+        base_path = self._get_subvolume_path(self.volname, subvolume, group_name=source_group)
+        return os.path.join(base_path, ".snap", snapshot)
+
+    def _verify_clone_attrs(self, source_path, clone_path):
+        path1 = source_path
+        path2 = clone_path
+
+        p = self.mount_a.run_shell(["find", path1])
+        paths = p.stdout.getvalue().strip().split()
+
+        # for each entry in source and clone (sink) verify certain inode attributes:
+        # inode type, mode, ownership, [am]time.
+        for source_path in paths:
+            sink_entry = source_path[len(path1)+1:]
+            sink_path = os.path.join(path2, sink_entry)
+
+            # mode+type
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%f', source_path]).stdout.getvalue().strip(), 16)
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%f', sink_path]).stdout.getvalue().strip(), 16)
+            self.assertEqual(sval, cval)
+
+            # ownership
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%u', source_path]).stdout.getvalue().strip())
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%u', sink_path]).stdout.getvalue().strip())
+            self.assertEqual(sval, cval)
+
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%g', source_path]).stdout.getvalue().strip())
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%g', sink_path]).stdout.getvalue().strip())
+            self.assertEqual(sval, cval)
+
+            # inode timestamps
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%X', source_path]).stdout.getvalue().strip())
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%X', sink_path]).stdout.getvalue().strip())
+            self.assertEqual(sval, cval)
+
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%Y', source_path]).stdout.getvalue().strip())
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%Y', sink_path]).stdout.getvalue().strip())
+            self.assertEqual(sval, cval)
+
+    def _verify_clone_root(self, source_path, clone_path, clone, clone_group, clone_pool):
+        # verifies following clone root attrs quota, data_pool and pool_namespace
+        # remaining attributes of clone root are validated in _verify_clone_attrs
+
+        clone_info = json.loads(self._get_subvolume_info(self.volname, clone, clone_group))
+
+        # verify quota is inherited from source snapshot
+        src_quota = self.mount_a.getfattr(source_path, "ceph.quota.max_bytes")
+        self.assertEqual(clone_info["bytes_quota"], "infinite" if src_quota is None else int(src_quota))
+
+        if clone_pool:
+            # verify pool is set as per request
+            self.assertEqual(clone_info["data_pool"], clone_pool)
+        else:
+            # verify pool and pool namespace are inherited from snapshot
+            self.assertEqual(clone_info["data_pool"],
+                             self.mount_a.getfattr(source_path, "ceph.dir.layout.pool"))
+            self.assertEqual(clone_info["pool_namespace"],
+                             self.mount_a.getfattr(source_path, "ceph.dir.layout.pool_namespace"))
+
+    def _verify_clone(self, subvolume, snapshot, clone,
+                      source_group=None, clone_group=None, clone_pool=None,
+                      subvol_path=None, source_version=2, timo=120):
+        # pass in subvol_path (subvolume path when snapshot was taken) when subvolume is removed
+        # but snapshots are retained for clone verification
+        path1 = self._get_subvolume_snapshot_path(subvolume, snapshot, source_group, subvol_path, source_version)
+        path2 = self._get_subvolume_path(self.volname, clone, group_name=clone_group)
+
+        check = 0
+        # TODO: currently snapshot rentries are not stable if snapshot source entries
+        #       are removed, https://tracker.ceph.com/issues/46747
+        while check < timo and subvol_path is None:
+            val1 = int(self.mount_a.getfattr(path1, "ceph.dir.rentries"))
+            val2 = int(self.mount_a.getfattr(path2, "ceph.dir.rentries"))
+            if val1 == val2:
+                break
+            check += 1
+            time.sleep(1)
+        self.assertTrue(check < timo)
+
+        self._verify_clone_root(path1, path2, clone, clone_group, clone_pool)
+        self._verify_clone_attrs(path1, path2)
+
+    def _generate_random_volume_name(self, count=1):
+        n = self.volume_start
+        volumes = [f"{TestVolumes.TEST_VOLUME_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.volume_start += count
+        return volumes[0] if count == 1 else volumes
+
+    def _generate_random_subvolume_name(self, count=1):
+        n = self.subvolume_start
+        subvolumes = [f"{TestVolumes.TEST_SUBVOLUME_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.subvolume_start += count
+        return subvolumes[0] if count == 1 else subvolumes
+
+    def _generate_random_group_name(self, count=1):
+        n = self.group_start
+        groups = [f"{TestVolumes.TEST_GROUP_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.group_start += count
+        return groups[0] if count == 1 else groups
+
+    def _generate_random_snapshot_name(self, count=1):
+        n = self.snapshot_start
+        snaps = [f"{TestVolumes.TEST_SNAPSHOT_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.snapshot_start += count
+        return snaps[0] if count == 1 else snaps
+
+    def _generate_random_clone_name(self, count=1):
+        n = self.clone_start
+        clones = [f"{TestVolumes.TEST_CLONE_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.clone_start += count
+        return clones[0] if count == 1 else clones
+
+    def _enable_multi_fs(self):
+        self._fs_cmd("flag", "set", "enable_multiple", "true", "--yes-i-really-mean-it")
+
+    def _create_or_reuse_test_volume(self):
+        result = json.loads(self._fs_cmd("volume", "ls"))
+        if len(result) == 0:
+            self.vol_created = True
+            self.volname = self._generate_random_volume_name()
+            self._fs_cmd("volume", "create", self.volname)
+        else:
+            self.volname = result[0]['name']
+
+    def  _get_subvolume_group_path(self, vol_name, group_name):
+        args = ("subvolumegroup", "getpath", vol_name, group_name)
+        path = self._fs_cmd(*args)
+        # remove the leading '/', and trailing whitespaces
+        return path[1:].rstrip()
+
+    def  _get_subvolume_path(self, vol_name, subvol_name, group_name=None):
+        args = ["subvolume", "getpath", vol_name, subvol_name]
+        if group_name:
+            args.append(group_name)
+        args = tuple(args)
+        path = self._fs_cmd(*args)
+        # remove the leading '/', and trailing whitespaces
+        return path[1:].rstrip()
+
+    def  _get_subvolume_info(self, vol_name, subvol_name, group_name=None):
+        args = ["subvolume", "info", vol_name, subvol_name]
+        if group_name:
+            args.append(group_name)
+        args = tuple(args)
+        subvol_md = self._fs_cmd(*args)
+        return subvol_md
+
+    def _get_subvolume_snapshot_info(self, vol_name, subvol_name, snapname, group_name=None):
+        args = ["subvolume", "snapshot", "info", vol_name, subvol_name, snapname]
+        if group_name:
+            args.append(group_name)
+        args = tuple(args)
+        snap_md = self._fs_cmd(*args)
+        return snap_md
+
+    def _delete_test_volume(self):
+        self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+
+    def _do_subvolume_pool_and_namespace_update(self, subvolume, pool=None, pool_namespace=None, subvolume_group=None):
+        subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group)
+
+        if pool is not None:
+            self.mount_a.setfattr(subvolpath, 'ceph.dir.layout.pool', pool)
+
+        if pool_namespace is not None:
+            self.mount_a.setfattr(subvolpath, 'ceph.dir.layout.pool_namespace', pool_namespace)
+
+    def _do_subvolume_attr_update(self, subvolume, uid, gid, mode, subvolume_group=None):
+        subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group)
+
+        # mode
+        self.mount_a.run_shell(['chmod', mode, subvolpath])
+
+        # ownership
+        self.mount_a.run_shell(['chown', uid, subvolpath])
+        self.mount_a.run_shell(['chgrp', gid, subvolpath])
+
+    def _do_subvolume_io(self, subvolume, subvolume_group=None, create_dir=None,
+                         number_of_files=DEFAULT_NUMBER_OF_FILES, file_size=DEFAULT_FILE_SIZE):
+        # get subvolume path for IO
+        args = ["subvolume", "getpath", self.volname, subvolume]
+        if subvolume_group:
+            args.append(subvolume_group)
+        args = tuple(args)
+        subvolpath = self._fs_cmd(*args)
+        self.assertNotEqual(subvolpath, None)
+        subvolpath = subvolpath[1:].rstrip() # remove "/" prefix and any trailing newline
+
+        io_path = subvolpath
+        if create_dir:
+            io_path = os.path.join(subvolpath, create_dir)
+            self.mount_a.run_shell(["mkdir", "-p", io_path])
+
+        log.debug("filling subvolume {0} with {1} files each {2}MB size under directory {3}".format(subvolume, number_of_files, file_size, io_path))
+        for i in range(number_of_files):
+            filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
+            self.mount_a.write_n_mb(os.path.join(io_path, filename), file_size)
+
+    def _do_subvolume_io_mixed(self, subvolume, subvolume_group=None):
+        subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group)
+
+        reg_file = "regfile.0"
+        reg_path = os.path.join(subvolpath, reg_file)
+        dir_path = os.path.join(subvolpath, "dir.0")
+        sym_path1 = os.path.join(subvolpath, "sym.0")
+        # this symlink's ownership would be changed
+        sym_path2 = os.path.join(dir_path, "sym.0")
+
+        #self.mount_a.write_n_mb(reg_path, TestVolumes.DEFAULT_FILE_SIZE)
+        self.mount_a.run_shell(["sudo", "mkdir", dir_path], omit_sudo=False)
+        self.mount_a.run_shell(["sudo", "ln", "-s", "./{}".format(reg_file), sym_path1], omit_sudo=False)
+        self.mount_a.run_shell(["sudo", "ln", "-s", "./{}".format(reg_file), sym_path2], omit_sudo=False)
+        # flip ownership to nobody. assumption: nobody's id is 65534
+        self.mount_a.run_shell(["sudo", "chown", "-h", "65534:65534", sym_path2], omit_sudo=False)
+
+    def _wait_for_trash_empty(self, timeout=30):
+        # XXX: construct the trash dir path (note that there is no mgr
+        # [sub]volume interface for this).
+        trashdir = os.path.join("./", "volumes", "_deleting")
+        self.mount_a.wait_for_dir_empty(trashdir, timeout=timeout)
+
+    def _assert_meta_location_and_version(self, vol_name, subvol_name, subvol_group=None, version=2, legacy=False):
+        if legacy:
+            subvol_path = self._get_subvolume_path(vol_name, subvol_name, group_name=subvol_group)
+            m = md5()
+            m.update(("/"+subvol_path).encode('utf-8'))
+            meta_filename = "{0}.meta".format(m.digest().hex())
+            metapath = os.path.join(".", "volumes", "_legacy", meta_filename)
+        else:
+            group = subvol_group if subvol_group is not None else '_nogroup'
+            metapath = os.path.join(".", "volumes", group, subvol_name, ".meta")
+
+        out = self.mount_a.run_shell(['cat', metapath])
+        lines = out.stdout.getvalue().strip().split('\n')
+        sv_version = -1
+        for line in lines:
+            if line == "version = " + str(version):
+                sv_version = version
+                break
+        self.assertEqual(sv_version, version, "version expected was '{0}' but got '{1}' from meta file at '{2}'".format(
+                         version, sv_version, metapath))
+
+    def _create_v1_subvolume(self, subvol_name, subvol_group=None, has_snapshot=True, subvol_type='subvolume', state='complete'):
+        group = subvol_group if subvol_group is not None else '_nogroup'
+        basepath = os.path.join("volumes", group, subvol_name)
+        uuid_str = str(uuid.uuid4())
+        createpath = os.path.join(basepath, uuid_str)
+        self.mount_a.run_shell(['mkdir', '-p', createpath])
+
+        # create a v1 snapshot, to prevent auto upgrades
+        if has_snapshot:
+            snappath = os.path.join(createpath, ".snap", "fake")
+            self.mount_a.run_shell(['mkdir', '-p', snappath])
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool)
+
+        # create a v1 .meta file
+        meta_contents = "[GLOBAL]\nversion = 1\ntype = {0}\npath = {1}\nstate = {2}\n".format(subvol_type, "/" + createpath, state)
+        if state == 'pending':
+            # add a fake clone source
+            meta_contents = meta_contents + '[source]\nvolume = fake\nsubvolume = fake\nsnapshot = fake\n'
+        meta_filepath1 = os.path.join(self.mount_a.mountpoint, basepath, ".meta")
+        sudo_write_file(self.mount_a.client_remote, meta_filepath1, meta_contents)
+        return createpath
+
+    def _update_fake_trash(self, subvol_name, subvol_group=None, trash_name='fake', create=True):
+        group = subvol_group if subvol_group is not None else '_nogroup'
+        trashpath = os.path.join("volumes", group, subvol_name, '.trash', trash_name)
+        if create:
+            self.mount_a.run_shell(['mkdir', '-p', trashpath])
+        else:
+            self.mount_a.run_shell(['rmdir', trashpath])
+
+    def _configure_guest_auth(self, guest_mount, authid, key):
+        """
+        Set up auth credentials for a guest client.
+        """
+        # Create keyring file for the guest client.
+        keyring_txt = dedent("""
+        [client.{authid}]
+            key = {key}
+
+        """.format(authid=authid,key=key))
+
+        guest_mount.client_id = authid
+        guest_mount.client_remote.write_file(guest_mount.get_keyring_path(),
+                                             keyring_txt, sudo=True)
+        # Add a guest client section to the ceph config file.
+        self.config_set("client.{0}".format(authid), "debug client", 20)
+        self.config_set("client.{0}".format(authid), "debug objecter", 20)
+        self.set_conf("client.{0}".format(authid),
+                      "keyring", guest_mount.get_keyring_path())
+
+    def _auth_metadata_get(self, filedata):
+        """
+        Return a deserialized JSON object, or None
+        """
+        try:
+            data = json.loads(filedata)
+        except json.decoder.JSONDecodeError:
+            data = None
+        return data
+
+    def setUp(self):
+        super(TestVolumes, self).setUp()
+        self.volname = None
+        self.vol_created = False
+        self._enable_multi_fs()
+        self._create_or_reuse_test_volume()
+        self.config_set('mon', 'mon_allow_pool_delete', True)
+        self.volume_start = random.randint(1, (1<<20))
+        self.subvolume_start = random.randint(1, (1<<20))
+        self.group_start = random.randint(1, (1<<20))
+        self.snapshot_start = random.randint(1, (1<<20))
+        self.clone_start = random.randint(1, (1<<20))
+
+    def tearDown(self):
+        if self.vol_created:
+            self._delete_test_volume()
+        super(TestVolumes, self).tearDown()
+
+    def test_connection_expiration(self):
+        # unmount any cephfs mounts
+        for i in range(0, self.CLIENTS_REQUIRED):
+            self.mounts[i].umount_wait()
+        sessions = self._session_list()
+        self.assertLessEqual(len(sessions), 1) # maybe mgr is already mounted
+
+        # Get the mgr to definitely mount cephfs
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        sessions = self._session_list()
+        self.assertEqual(len(sessions), 1)
+
+        # Now wait for the mgr to expire the connection:
+        self.wait_until_evicted(sessions[0]['id'], timeout=90)
+
+    def test_volume_create(self):
+        """
+        That the volume can be created and then cleans up
+        """
+        volname = self._generate_random_volume_name()
+        self._fs_cmd("volume", "create", volname)
+        volumels = json.loads(self._fs_cmd("volume", "ls"))
+
+        if not (volname in ([volume['name'] for volume in volumels])):
+            raise RuntimeError("Error creating volume '{0}'".format(volname))
+        else:
+            # clean up
+            self._fs_cmd("volume", "rm", volname, "--yes-i-really-mean-it")
+
+    def test_volume_ls(self):
+        """
+        That the existing and the newly created volumes can be listed and
+        finally cleans up.
+        """
+        vls = json.loads(self._fs_cmd("volume", "ls"))
+        volumes = [volume['name'] for volume in vls]
+
+        #create new volumes and add it to the existing list of volumes
+        volumenames = self._generate_random_volume_name(2)
+        for volumename in volumenames:
+            self._fs_cmd("volume", "create", volumename)
+        volumes.extend(volumenames)
+
+        # list volumes
+        try:
+            volumels = json.loads(self._fs_cmd('volume', 'ls'))
+            if len(volumels) == 0:
+                raise RuntimeError("Expected the 'fs volume ls' command to list the created volumes.")
+            else:
+                volnames = [volume['name'] for volume in volumels]
+                if collections.Counter(volnames) != collections.Counter(volumes):
+                    raise RuntimeError("Error creating or listing volumes")
+        finally:
+            # clean up
+            for volume in volumenames:
+                self._fs_cmd("volume", "rm", volume, "--yes-i-really-mean-it")
+
+    def test_volume_rm(self):
+        """
+        That the volume can only be removed when --yes-i-really-mean-it is used
+        and verify that the deleted volume is not listed anymore.
+        """
+        for m in self.mounts:
+            m.umount_wait()
+        try:
+            self._fs_cmd("volume", "rm", self.volname)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EPERM:
+                raise RuntimeError("expected the 'fs volume rm' command to fail with EPERM, "
+                                   "but it failed with {0}".format(ce.exitstatus))
+            else:
+                self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+
+                #check if it's gone
+                volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty"))
+                if (self.volname in [volume['name'] for volume in volumes]):
+                    raise RuntimeError("Expected the 'fs volume rm' command to succeed. "
+                                       "The volume {0} not removed.".format(self.volname))
+        else:
+            raise RuntimeError("expected the 'fs volume rm' command to fail.")
+
+    def test_subvolume_marked(self):
+        """
+        ensure a subvolume is marked with the ceph.dir.subvolume xattr
+        """
+        subvolume = self._generate_random_subvolume_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # getpath
+        subvolpath = self._get_subvolume_path(self.volname, subvolume)
+
+        # subdirectory of a subvolume cannot be moved outside the subvolume once marked with
+        # the xattr ceph.dir.subvolume, hence test by attempting to rename subvol path (incarnation)
+        # outside the subvolume
+        dstpath = os.path.join(self.mount_a.mountpoint, 'volumes', '_nogroup', 'new_subvol_location')
+        srcpath = os.path.join(self.mount_a.mountpoint, subvolpath)
+        rename_script = dedent("""
+            import os
+            import errno
+            try:
+                os.rename("{src}", "{dst}")
+            except OSError as e:
+                if e.errno != errno.EXDEV:
+                    raise RuntimeError("invalid error code on renaming subvolume incarnation out of subvolume directory")
+            else:
+                raise RuntimeError("expected renaming subvolume incarnation out of subvolume directory to fail")
+            """)
+        self.mount_a.run_python(rename_script.format(src=srcpath, dst=dstpath))
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_volume_rm_arbitrary_pool_removal(self):
+        """
+        That the arbitrary pool added to the volume out of band is removed
+        successfully on volume removal.
+        """
+        for m in self.mounts:
+            m.umount_wait()
+        new_pool = "new_pool"
+        # add arbitrary data pool
+        self.fs.add_data_pool(new_pool)
+        self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+
+        #check if fs is gone
+        volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty"))
+        volnames = [volume['name'] for volume in volumes]
+        self.assertNotIn(self.volname, volnames)
+
+        #check if osd pools are gone
+        pools = json.loads(self._raw_cmd("osd", "pool", "ls", "detail", "--format=json-pretty"))
+        for pool in pools:
+            self.assertNotIn(self.volname, pool["application_metadata"].keys())
+
+    def test_volume_rm_when_mon_delete_pool_false(self):
+        """
+        That the volume can only be removed when mon_allowd_pool_delete is set
+        to true and verify that the pools are removed after volume deletion.
+        """
+        for m in self.mounts:
+            m.umount_wait()
+        self.config_set('mon', 'mon_allow_pool_delete', False)
+        try:
+            self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM,
+                             "expected the 'fs volume rm' command to fail with EPERM, "
+                             "but it failed with {0}".format(ce.exitstatus))
+        self.config_set('mon', 'mon_allow_pool_delete', True)
+        self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+
+        #check if fs is gone
+        volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty"))
+        volnames = [volume['name'] for volume in volumes]
+        self.assertNotIn(self.volname, volnames,
+                         "volume {0} exists after removal".format(self.volname))
+        #check if pools are gone
+        pools = json.loads(self._raw_cmd("osd", "pool", "ls", "detail", "--format=json-pretty"))
+        for pool in pools:
+            self.assertNotIn(self.volname, pool["application_metadata"].keys(),
+                             "pool {0} exists after volume removal".format(pool["pool_name"]))
+
+    ### basic subvolume operations
+
+    def test_subvolume_create_and_rm(self):
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # make sure it exists
+        subvolpath = self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        self.assertNotEqual(subvolpath, None)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        # make sure its gone
+        try:
+            self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolume getpath' command to fail. Subvolume not removed.")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_expand(self):
+        """
+        That a subvolume can be expanded in size and its quota matches the expected size.
+        """
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # expand the subvolume
+        nsize = osize*2
+        self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+
+        # verify the quota
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, nsize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_shrink(self):
+        """
+        That a subvolume can be shrinked in size and its quota matches the expected size.
+        """
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # shrink the subvolume
+        nsize = osize // 2
+        self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+
+        # verify the quota
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, nsize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_fail_invalid_size(self):
+        """
+        That a subvolume cannot be resized to an invalid size and the quota did not change
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # try to resize the subvolume with an invalid size -10
+        nsize = -10
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size")
+        else:
+            self.fail("expected the 'fs subvolume resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_fail_zero_size(self):
+        """
+        That a subvolume cannot be resized to a zero size and the quota did not change
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # try to resize the subvolume with size 0
+        nsize = 0
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size")
+        else:
+            self.fail("expected the 'fs subvolume resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_quota_lt_used_size(self):
+        """
+        That a subvolume can be resized to a size smaller than the current used size
+        and the resulting quota matches the expected size.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*20
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # create one file of 10MB
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+1)
+        self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+
+        usedsize = int(self.mount_a.getfattr(subvolpath, "ceph.dir.rbytes"))
+        susedsize = int(self.mount_a.run_shell(['stat', '-c' '%s', subvolpath]).stdout.getvalue().strip())
+        self.assertEqual(usedsize, susedsize)
+
+        # shrink the subvolume
+        nsize = usedsize // 2
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume resize' command to succeed")
+
+        # verify the quota
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, nsize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+
+    def test_subvolume_resize_fail_quota_lt_used_size_no_shrink(self):
+        """
+        That a subvolume cannot be resized to a size smaller than the current used size
+        when --no_shrink is given and the quota did not change.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*20
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # create one file of 10MB
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+2)
+        self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+
+        usedsize = int(self.mount_a.getfattr(subvolpath, "ceph.dir.rbytes"))
+        susedsize = int(self.mount_a.run_shell(['stat', '-c' '%s', subvolpath]).stdout.getvalue().strip())
+        self.assertEqual(usedsize, susedsize)
+
+        # shrink the subvolume
+        nsize = usedsize // 2
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize), "--no_shrink")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size")
+        else:
+            self.fail("expected the 'fs subvolume resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_expand_on_full_subvolume(self):
+        """
+        That the subvolume can be expanded from a full subvolume and future writes succeed.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*10
+        # create subvolume of quota 10MB and make sure it exists
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # create one file of size 10MB and write
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+3)
+        self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+
+        # create a file of size 5MB and try write more
+        file_size=file_size // 2
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+4)
+        try:
+            self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+        except CommandFailedError:
+            # Not able to write. So expand the subvolume more and try writing the 5MB file again
+            nsize = osize*2
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+            try:
+                self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+            except CommandFailedError:
+                self.fail("expected filling subvolume {0} with {1} file of size {2}MB"
+                                   "to succeed".format(subvolname, number_of_files, file_size))
+        else:
+            self.fail("expected filling subvolume {0} with {1} file of size {2}MB"
+                               "to fail".format(subvolname, number_of_files, file_size))
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_idempotence(self):
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # try creating w/ same subvolume name -- should be idempotent
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_idempotence_resize(self):
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # try creating w/ same subvolume name with size -- should set quota
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "1000000000")
+
+        # get subvolume metadata
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        self.assertEqual(subvol_info["bytes_quota"], 1000000000)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_isolated_namespace(self):
+        """
+        Create subvolume in separate rados namespace
+        """
+
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--namespace-isolated")
+
+        # get subvolume metadata
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        self.assertNotEqual(len(subvol_info), 0)
+        self.assertEqual(subvol_info["pool_namespace"], "fsvolumens_" + subvolume)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_invalid_data_pool_layout(self):
+        subvolume = self._generate_random_subvolume_name()
+        data_pool = "invalid_pool"
+        # create subvolume with invalid data pool layout
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on create of subvolume with invalid pool layout")
+        else:
+            self.fail("expected the 'fs subvolume create' command to fail")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_rm_force(self):
+        # test removing non-existing subvolume with --force
+        subvolume = self._generate_random_subvolume_name()
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--force")
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume rm --force' command to succeed")
+
+    def test_subvolume_create_with_auto_cleanup_on_fail(self):
+        subvolume = self._generate_random_subvolume_name()
+        data_pool = "invalid_pool"
+        # create subvolume with invalid data pool layout fails
+        with self.assertRaises(CommandFailedError):
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool)
+
+        # check whether subvol path is cleaned up
+        try:
+            self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of non-existent subvolume")
+        else:
+            self.fail("expected the 'fs subvolume getpath' command to fail")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_invalid_size(self):
+        # create subvolume with an invalid size -1
+        subvolume = self._generate_random_subvolume_name()
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--size", "-1")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on create of subvolume with invalid size")
+        else:
+            self.fail("expected the 'fs subvolume create' command to fail")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_nonexistent_subvolume_rm(self):
+        # remove non-existing subvolume
+        subvolume = "non_existent_subvolume"
+
+        # try, remove subvolume
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolume rm' command to fail")
+
+    def test_nonexistent_subvolume_group_create(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = "non_existent_group"
+
+        # try, creating subvolume in a nonexistent group
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolume create' command to fail")
+
+    def test_default_uid_gid_subvolume(self):
+        subvolume = self._generate_random_subvolume_name()
+        expected_uid = 0
+        expected_gid = 0
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # check subvolume's uid and gid
+        stat = self.mount_a.stat(subvol_path)
+        self.assertEqual(stat['st_uid'], expected_uid)
+        self.assertEqual(stat['st_gid'], expected_gid)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_ls(self):
+        # tests the 'fs subvolume ls' command
+
+        subvolumes = []
+
+        # create subvolumes
+        subvolumes = self._generate_random_subvolume_name(3)
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # list subvolumes
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        if len(subvolumels) == 0:
+            self.fail("Expected the 'fs subvolume ls' command to list the created subvolumes.")
+        else:
+            subvolnames = [subvolume['name'] for subvolume in subvolumels]
+            if collections.Counter(subvolnames) != collections.Counter(subvolumes):
+                self.fail("Error creating or listing subvolumes")
+
+        # remove subvolume
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_ls_for_notexistent_default_group(self):
+        # tests the 'fs subvolume ls' command when the default group '_nogroup' doesn't exist
+        # prerequisite: we expect that the volume is created and the default group _nogroup is
+        # NOT created (i.e. a subvolume without group is not created)
+
+        # list subvolumes
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        if len(subvolumels) > 0:
+            raise RuntimeError("Expected the 'fs subvolume ls' command to output an empty list.")
+
+    def test_subvolume_resize_infinite_size(self):
+        """
+        That a subvolume can be resized to an infinite size by unsetting its quota.
+        """
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size",
+                     str(self.DEFAULT_FILE_SIZE*1024*1024))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # resize inf
+        self._fs_cmd("subvolume", "resize", self.volname, subvolname, "inf")
+
+        # verify that the quota is None
+        size = self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")
+        self.assertEqual(size, None)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_infinite_size_future_writes(self):
+        """
+        That a subvolume can be resized to an infinite size and the future writes succeed.
+        """
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size",
+                     str(self.DEFAULT_FILE_SIZE*1024*1024*5))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # resize inf
+        self._fs_cmd("subvolume", "resize", self.volname, subvolname, "inf")
+
+        # verify that the quota is None
+        size = self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")
+        self.assertEqual(size, None)
+
+        # create one file of 10MB and try to write
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+5)
+
+        try:
+            self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+        except CommandFailedError:
+            self.fail("expected filling subvolume {0} with {1} file of size {2}MB "
+                               "to succeed".format(subvolname, number_of_files, file_size))
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_info(self):
+        # tests the 'fs subvolume info' command
+
+        subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime",
+                     "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace",
+                     "type", "uid", "features", "state"]
+
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # get subvolume metadata
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        for md in subvol_md:
+            self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md))
+
+        self.assertEqual(subvol_info["bytes_pcent"], "undefined", "bytes_pcent should be set to undefined if quota is not set")
+        self.assertEqual(subvol_info["bytes_quota"], "infinite", "bytes_quota should be set to infinite if quota is not set")
+        self.assertEqual(subvol_info["pool_namespace"], "", "expected pool namespace to be empty")
+        self.assertEqual(subvol_info["state"], "complete", "expected state to be complete")
+
+        self.assertEqual(len(subvol_info["features"]), 3,
+                         msg="expected 3 features, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"]))
+        for feature in ['snapshot-clone', 'snapshot-autoprotect', 'snapshot-retention']:
+            self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
+
+        nsize = self.DEFAULT_FILE_SIZE*1024*1024
+        self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize))
+
+        # get subvolume metadata after quota set
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        for md in subvol_md:
+            self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md))
+
+        self.assertNotEqual(subvol_info["bytes_pcent"], "undefined", "bytes_pcent should not be set to undefined if quota is not set")
+        self.assertEqual(subvol_info["bytes_quota"], nsize, "bytes_quota should be set to '{0}'".format(nsize))
+        self.assertEqual(subvol_info["type"], "subvolume", "type should be set to subvolume")
+        self.assertEqual(subvol_info["state"], "complete", "expected state to be complete")
+
+        self.assertEqual(len(subvol_info["features"]), 3,
+                         msg="expected 3 features, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"]))
+        for feature in ['snapshot-clone', 'snapshot-autoprotect', 'snapshot-retention']:
+            self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_clone_subvolume_info(self):
+
+        # tests the 'fs subvolume info' command for a clone
+        subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime",
+                     "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace",
+                     "type", "uid"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=1)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, clone))
+        if len(subvol_info) == 0:
+            raise RuntimeError("Expected the 'fs subvolume info' command to list metadata of subvolume")
+        for md in subvol_md:
+            if md not in subvol_info.keys():
+                raise RuntimeError("%s not present in the metadata of subvolume" % md)
+        if subvol_info["type"] != "clone":
+            raise RuntimeError("type should be set to clone")
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+
+    ### subvolume group operations
+
+    def test_subvolume_create_and_rm_in_group(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_create_with_desired_data_pool_layout(self):
+        group1, group2 = self._generate_random_group_name(2)
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group1)
+        group1_path = self._get_subvolume_group_path(self.volname, group1)
+
+        default_pool = self.mount_a.getfattr(group1_path, "ceph.dir.layout.pool")
+        new_pool = "new_pool"
+        self.assertNotEqual(default_pool, new_pool)
+
+        # add data pool
+        self.fs.add_data_pool(new_pool)
+
+        # create group specifying the new data pool as its pool layout
+        self._fs_cmd("subvolumegroup", "create", self.volname, group2,
+                     "--pool_layout", new_pool)
+        group2_path = self._get_subvolume_group_path(self.volname, group2)
+
+        desired_pool = self.mount_a.getfattr(group2_path, "ceph.dir.layout.pool")
+        self.assertEqual(desired_pool, new_pool)
+
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group1)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group2)
+
+    ### authorize operations
+
+    def test_authorize_deauthorize_legacy_subvolume(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        authid = "alice"
+
+        guest_mount = self.mount_b
+        guest_mount.umount_wait()
+
+        # emulate a old-fashioned subvolume in a custom group
+        createpath = os.path.join(".", "volumes", group, subvolume)
+        self.mount_a.run_shell(['mkdir', '-p', createpath])
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool)
+
+        mount_path = os.path.join("/", "volumes", group, subvolume)
+
+        # authorize guest authID read-write access to subvolume
+        key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid,
+                           "--group_name", group, "--tenant_id", "tenant_id")
+
+        # guest authID should exist
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertIn("client.{0}".format(authid), existing_ids)
+
+        # configure credentials for guest client
+        self._configure_guest_auth(guest_mount, authid, key)
+
+        # mount the subvolume, and write to it
+        guest_mount.mount(mount_path=mount_path)
+        guest_mount.write_n_mb("data.bin", 1)
+
+        # authorize guest authID read access to subvolume
+        key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid,
+                           "--group_name", group, "--tenant_id", "tenant_id", "--access_level", "r")
+
+        # guest client sees the change in access level to read only after a
+        # remount of the subvolume.
+        guest_mount.umount_wait()
+        guest_mount.mount(mount_path=mount_path)
+
+        # read existing content of the subvolume
+        self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
+        # cannot write into read-only subvolume
+        with self.assertRaises(CommandFailedError):
+            guest_mount.write_n_mb("rogue.bin", 1)
+
+        # cleanup
+        guest_mount.umount_wait()
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid,
+                     "--group_name", group)
+        # guest authID should no longer exist
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertNotIn("client.{0}".format(authid), existing_ids)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_authorize_deauthorize_subvolume(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        authid = "alice"
+
+        guest_mount = self.mount_b
+        guest_mount.umount_wait()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+        mount_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolume,
+                                  "--group_name", group).rstrip()
+
+        # authorize guest authID read-write access to subvolume
+        key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid,
+                           "--group_name", group, "--tenant_id", "tenant_id")
+
+        # guest authID should exist
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertIn("client.{0}".format(authid), existing_ids)
+
+        # configure credentials for guest client
+        self._configure_guest_auth(guest_mount, authid, key)
+
+        # mount the subvolume, and write to it
+        guest_mount.mount(mount_path=mount_path)
+        guest_mount.write_n_mb("data.bin", 1)
+
+        # authorize guest authID read access to subvolume
+        key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid,
+                           "--group_name", group, "--tenant_id", "tenant_id", "--access_level", "r")
+
+        # guest client sees the change in access level to read only after a
+        # remount of the subvolume.
+        guest_mount.umount_wait()
+        guest_mount.mount(mount_path=mount_path)
+
+        # read existing content of the subvolume
+        self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
+        # cannot write into read-only subvolume
+        with self.assertRaises(CommandFailedError):
+            guest_mount.write_n_mb("rogue.bin", 1)
+
+        # cleanup
+        guest_mount.umount_wait()
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid,
+                     "--group_name", group)
+        # guest authID should no longer exist
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertNotIn("client.{0}".format(authid), existing_ids)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_multitenant_subvolumes(self):
+        """
+        That subvolume access can be restricted to a tenant.
+
+        That metadata used to enforce tenant isolation of
+        subvolumes is stored as a two-way mapping between auth
+        IDs and subvolumes that they're authorized to access.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        guest_mount = self.mount_b
+
+        # Guest clients belonging to different tenants, but using the same
+        # auth ID.
+        auth_id = "alice"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+        guestclient_2 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant2",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Check that subvolume metadata file is created on subvolume creation.
+        subvol_metadata_filename = "_{0}:{1}.meta".format(group, subvolume)
+        self.assertIn(subvol_metadata_filename, guest_mount.ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'alice' and belonging to
+        # 'tenant1', with 'rw' access to the volume.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'alice', is
+        # created on authorizing 'alice' access to the subvolume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different subvolumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 5,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "{0}/{1}".format(group,subvolume): {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename)))
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Verify that the subvolume metadata file stores info about auth IDs
+        # and their access levels to the subvolume, versioning details, etc.
+        expected_subvol_metadata = {
+            "version": 1,
+            "compat_version": 1,
+            "auths": {
+                "alice": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+        subvol_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(subvol_metadata_filename)))
+
+        self.assertGreaterEqual(subvol_metadata["version"], expected_subvol_metadata["version"])
+        del expected_subvol_metadata["version"]
+        del subvol_metadata["version"]
+        self.assertEqual(expected_subvol_metadata, subvol_metadata)
+
+        # Cannot authorize 'guestclient_2' to access the volume.
+        # It uses auth ID 'alice', which has already been used by a
+        # 'guestclient_1' belonging to an another tenant for accessing
+        # the volume.
+
+        try:
+            self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_2["auth_id"],
+                         "--group_name", group, "--tenant_id", guestclient_2["tenant_id"])
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM,
+                             "Invalid error code returned on authorize of subvolume with same auth_id but different tenant_id")
+        else:
+            self.fail("expected the 'fs subvolume authorize' command to fail")
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's only access to a volume.
+
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id,
+                     "--group_name", group)
+        self.assertNotIn(auth_metadata_filename, guest_mount.ls("volumes"))
+
+        # Check that subvolume metadata file is cleaned up on subvolume deletion.
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self.assertNotIn(subvol_metadata_filename, guest_mount.ls("volumes"))
+
+        # clean up
+        guest_mount.umount_wait()
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_authorized_list(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        authid1 = "alice"
+        authid2 = "guest1"
+        authid3 = "guest2"
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # authorize alice authID read-write access to subvolume
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid1,
+                     "--group_name", group)
+        # authorize guest1 authID read-write access to subvolume
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid2,
+                     "--group_name", group)
+        # authorize guest2 authID read access to subvolume
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid3,
+                     "--group_name", group, "--access_level", "r")
+
+        # list authorized-ids of the subvolume
+        expected_auth_list = [{'alice': 'rw'}, {'guest1': 'rw'}, {'guest2': 'r'}]
+        auth_list = json.loads(self._fs_cmd('subvolume', 'authorized_list', self.volname, subvolume, "--group_name", group))
+        self.assertCountEqual(expected_auth_list, auth_list)
+
+        # cleanup
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid1,
+                     "--group_name", group)
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid2,
+                     "--group_name", group)
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid3,
+                     "--group_name", group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_authorize_auth_id_not_created_by_mgr_volumes(self):
+        """
+        If the auth_id already exists and is not created by mgr plugin,
+        it's not allowed to authorize the auth-id by default.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # Create auth_id
+        self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.guest1",
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        try:
+            self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                         "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM,
+                             "Invalid error code returned on authorize of subvolume for auth_id created out of band")
+        else:
+            self.fail("expected the 'fs subvolume authorize' command to fail")
+
+        # clean up
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_authorize_allow_existing_id_option(self):
+        """
+        If the auth_id already exists and is not created by mgr volumes,
+        it's not allowed to authorize the auth-id by default but is
+        allowed with option allow_existing_id.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # Create auth_id
+        self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.guest1",
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Cannot authorize 'guestclient_1' to access the volume by default,
+        # which already exists and not created by mgr volumes but is allowed
+        # with option 'allow_existing_id'.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"], "--allow-existing-id")
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id,
+                     "--group_name", group)
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_deauthorize_auth_id_after_out_of_band_update(self):
+        """
+        If the auth_id authorized by mgr/volumes plugin is updated
+        out of band, the auth_id should not be deleted after a
+        deauthorize. It should only remove caps associated with it.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        subvol_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolume,
+                                  "--group_name", group).rstrip()
+
+        # Update caps for guestclient_1 out of band
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "caps", "client.guest1",
+            "mds", "allow rw path=/volumes/{0}, allow rw path={1}".format(group, subvol_path),
+            "osd", "allow rw pool=cephfs_data",
+            "mon", "allow r",
+            "mgr", "allow *"
+        )
+
+        # Deauthorize guestclient_1
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id, "--group_name", group)
+
+        # Validate the caps of guestclient_1 after deauthorize. It should not have deleted
+        # guestclient_1. The mgr and mds caps should be present which was updated out of band.
+        out = json.loads(self.fs.mon_manager.raw_cluster_cmd("auth", "get", "client.guest1", "--format=json-pretty"))
+
+        self.assertEqual("client.guest1", out[0]["entity"])
+        self.assertEqual("allow rw path=/volumes/{0}".format(group), out[0]["caps"]["mds"])
+        self.assertEqual("allow *", out[0]["caps"]["mgr"])
+        self.assertNotIn("osd", out[0]["caps"])
+
+        # clean up
+        out = self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_recover_auth_metadata_during_authorize(self):
+        """
+        That auth metadata manager can recover from partial auth updates using
+        metadata files, which store auth info and its update status info. This
+        test validates the recovery during authorize.
+        """
+
+        guest_mount = self.mount_b
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'guest1', is
+        # created on authorizing 'guest1' access to the subvolume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+        expected_auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename)))
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run authorize again.
+        guest_mount.run_shell(['sed', '-i', 's/false/true/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Authorize 'guestclient_1' to access the subvolume.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename)))
+        self.assertEqual(auth_metadata_content, expected_auth_metadata_content)
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id, "--group_name", group)
+        guest_mount.umount_wait()
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_recover_auth_metadata_during_deauthorize(self):
+        """
+        That auth metadata manager can recover from partial auth updates using
+        metadata files, which store auth info and its update status info. This
+        test validates the recovery during deauthorize.
+        """
+
+        guest_mount = self.mount_b
+
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        guestclient_1 = {
+            "auth_id": "guest1",
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolumes in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume1.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'guest1', is
+        # created on authorizing 'guest1' access to the subvolume1.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+        expected_auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename)))
+
+        # Authorize 'guestclient_1' to access the subvolume2.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run de-authorize.
+        guest_mount.run_shell(['sed', '-i', 's/false/true/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Deauthorize 'guestclient_1' to access the subvolume2.
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, guestclient_1["auth_id"],
+                     "--group_name", group)
+
+        auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename)))
+        self.assertEqual(auth_metadata_content, expected_auth_metadata_content)
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, "guest1", "--group_name", group)
+        guest_mount.umount_wait()
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_update_old_style_auth_metadata_to_new_during_authorize(self):
+        """
+        CephVolumeClient stores the subvolume data in auth metadata file with
+        'volumes' key as there was no subvolume namespace. It doesn't makes sense
+        with mgr/volumes. This test validates the transparent update of 'volumes'
+        key to 'subvolumes' key in auth metadata file during authorize.
+        """
+
+        guest_mount = self.mount_b
+
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolumes in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume1.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'guest1', is
+        # created on authorizing 'guest1' access to the subvolume1.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        guest_mount.run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Authorize 'guestclient_1' to access the subvolume2. This should transparently update 'volumes' to 'subvolumes'
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        expected_auth_metadata = {
+            "version": 5,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "{0}/{1}".format(group,subvolume1): {
+                    "dirty": False,
+                    "access_level": "rw"
+                },
+                "{0}/{1}".format(group,subvolume2): {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename)))
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, auth_id, "--group_name", group)
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, auth_id, "--group_name", group)
+        guest_mount.umount_wait()
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_update_old_style_auth_metadata_to_new_during_deauthorize(self):
+        """
+        CephVolumeClient stores the subvolume data in auth metadata file with
+        'volumes' key as there was no subvolume namespace. It doesn't makes sense
+        with mgr/volumes. This test validates the transparent update of 'volumes'
+        key to 'subvolumes' key in auth metadata file during deauthorize.
+        """
+
+        guest_mount = self.mount_b
+
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolumes in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume1.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Authorize 'guestclient_1' to access the subvolume2.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'guest1', is created.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        guest_mount.run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Deauthorize 'guestclient_1' to access the subvolume2. This should update 'volumes' to subvolumes'
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, auth_id, "--group_name", group)
+
+        expected_auth_metadata = {
+            "version": 5,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "{0}/{1}".format(group,subvolume1): {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename)))
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, auth_id, "--group_name", group)
+        guest_mount.umount_wait()
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_evict_client(self):
+        """
+        That a subvolume client can be evicted based on the auth ID
+        """
+
+        subvolumes = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # mounts[0] and mounts[1] would be used as guests to mount the volumes/shares.
+        for i in range(0, 2):
+            self.mounts[i].umount_wait()
+        guest_mounts = (self.mounts[0], self.mounts[1])
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create two subvolumes. Authorize 'guest' auth ID to mount the two
+        # subvolumes. Mount the two subvolumes. Write data to the volumes.
+        for i in range(2):
+            # Create subvolume.
+            self._fs_cmd("subvolume", "create", self.volname, subvolumes[i], "--group_name", group)
+
+            # authorize guest authID read-write access to subvolume
+            key = self._fs_cmd("subvolume", "authorize", self.volname, subvolumes[i], guestclient_1["auth_id"],
+                               "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+            mount_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolumes[i],
+                                      "--group_name", group).rstrip()
+            # configure credentials for guest client
+            self._configure_guest_auth(guest_mounts[i], auth_id, key)
+
+            # mount the subvolume, and write to it
+            guest_mounts[i].mount(mount_path=mount_path)
+            guest_mounts[i].write_n_mb("data.bin", 1)
+
+        # Evict client, guest_mounts[0], using auth ID 'guest' and has mounted
+        # one volume.
+        self._fs_cmd("subvolume", "evict", self.volname, subvolumes[0], auth_id, "--group_name", group)
+
+        # Evicted guest client, guest_mounts[0], should not be able to do
+        # anymore metadata ops.  It should start failing all operations
+        # when it sees that its own address is in the blocklist.
+        try:
+            guest_mounts[0].write_n_mb("rogue.bin", 1)
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError("post-eviction write should have failed!")
+
+        # The blocklisted guest client should now be unmountable
+        guest_mounts[0].umount_wait()
+
+        # Guest client, guest_mounts[1], using the same auth ID 'guest', but
+        # has mounted the other volume, should be able to use its volume
+        # unaffected.
+        guest_mounts[1].write_n_mb("data.bin.1", 1)
+
+        # Cleanup.
+        guest_mounts[1].umount_wait()
+        for i in range(2):
+            self._fs_cmd("subvolume", "deauthorize", self.volname, subvolumes[i], auth_id, "--group_name", group)
+            self._fs_cmd("subvolume", "rm", self.volname, subvolumes[i], "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_create_with_invalid_data_pool_layout(self):
+        group = self._generate_random_group_name()
+        data_pool = "invalid_pool"
+        # create group with invalid data pool layout
+        try:
+            self._fs_cmd("subvolumegroup", "create", self.volname, group, "--pool_layout", data_pool)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolumegroup create' command to fail")
+
+    def test_subvolume_group_rm_force(self):
+        # test removing non-existing subvolume group with --force
+        group = self._generate_random_group_name()
+        try:
+            self._fs_cmd("subvolumegroup", "rm", self.volname, group, "--force")
+        except CommandFailedError as ce:
+            raise RuntimeError("expected the 'fs subvolumegroup rm --force' command to succeed")
+
+    def test_subvolume_group_create_with_auto_cleanup_on_fail(self):
+        group = self._generate_random_group_name()
+        data_pool = "invalid_pool"
+        # create group with invalid data pool layout
+        with self.assertRaises(CommandFailedError):
+            self._fs_cmd("subvolumegroup", "create", self.volname, group, "--pool_layout", data_pool)
+
+        # check whether group path is cleaned up
+        try:
+            self._fs_cmd("subvolumegroup", "getpath", self.volname, group)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolumegroup getpath' command to fail")
+
+    def test_subvolume_create_with_desired_data_pool_layout_in_group(self):
+        subvol1, subvol2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # create group. this also helps set default pool layout for subvolumes
+        # created within the group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvol1, "--group_name", group)
+        subvol1_path = self._get_subvolume_path(self.volname, subvol1, group_name=group)
+
+        default_pool = self.mount_a.getfattr(subvol1_path, "ceph.dir.layout.pool")
+        new_pool = "new_pool"
+        self.assertNotEqual(default_pool, new_pool)
+
+        # add data pool
+        self.fs.add_data_pool(new_pool)
+
+        # create subvolume specifying the new data pool as its pool layout
+        self._fs_cmd("subvolume", "create", self.volname, subvol2, "--group_name", group,
+                     "--pool_layout", new_pool)
+        subvol2_path = self._get_subvolume_path(self.volname, subvol2, group_name=group)
+
+        desired_pool = self.mount_a.getfattr(subvol2_path, "ceph.dir.layout.pool")
+        self.assertEqual(desired_pool, new_pool)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvol2, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvol1, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_create_with_desired_mode(self):
+        group1, group2 = self._generate_random_group_name(2)
+        # default mode
+        expected_mode1 = "755"
+        # desired mode
+        expected_mode2 = "777"
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group1)
+        self._fs_cmd("subvolumegroup", "create", self.volname, group2, "--mode", "777")
+
+        group1_path = self._get_subvolume_group_path(self.volname, group1)
+        group2_path = self._get_subvolume_group_path(self.volname, group2)
+
+        # check group's mode
+        actual_mode1 = self.mount_a.run_shell(['stat', '-c' '%a', group1_path]).stdout.getvalue().strip()
+        actual_mode2 = self.mount_a.run_shell(['stat', '-c' '%a', group2_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode1, expected_mode1)
+        self.assertEqual(actual_mode2, expected_mode2)
+
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group1)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group2)
+
+    def test_subvolume_group_create_with_desired_uid_gid(self):
+        """
+        That the subvolume group can be created with the desired uid and gid and its uid and gid matches the
+        expected values.
+        """
+        uid = 1000
+        gid = 1000
+
+        # create subvolume group
+        subvolgroupname = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, subvolgroupname, "--uid", str(uid), "--gid", str(gid))
+
+        # make sure it exists
+        subvolgrouppath = self._get_subvolume_group_path(self.volname, subvolgroupname)
+        self.assertNotEqual(subvolgrouppath, None)
+
+        # verify the uid and gid
+        suid = int(self.mount_a.run_shell(['stat', '-c' '%u', subvolgrouppath]).stdout.getvalue().strip())
+        sgid = int(self.mount_a.run_shell(['stat', '-c' '%g', subvolgrouppath]).stdout.getvalue().strip())
+        self.assertEqual(uid, suid)
+        self.assertEqual(gid, sgid)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, subvolgroupname)
+
+    def test_subvolume_create_with_desired_mode_in_group(self):
+        subvol1, subvol2, subvol3 = self._generate_random_subvolume_name(3)
+
+        group = self._generate_random_group_name()
+        # default mode
+        expected_mode1 = "755"
+        # desired mode
+        expected_mode2 = "777"
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvol1, "--group_name", group)
+        self._fs_cmd("subvolume", "create", self.volname, subvol2, "--group_name", group, "--mode", "777")
+        # check whether mode 0777 also works
+        self._fs_cmd("subvolume", "create", self.volname, subvol3, "--group_name", group, "--mode", "0777")
+
+        subvol1_path = self._get_subvolume_path(self.volname, subvol1, group_name=group)
+        subvol2_path = self._get_subvolume_path(self.volname, subvol2, group_name=group)
+        subvol3_path = self._get_subvolume_path(self.volname, subvol3, group_name=group)
+
+        # check subvolume's  mode
+        actual_mode1 = self.mount_a.run_shell(['stat', '-c' '%a', subvol1_path]).stdout.getvalue().strip()
+        actual_mode2 = self.mount_a.run_shell(['stat', '-c' '%a', subvol2_path]).stdout.getvalue().strip()
+        actual_mode3 = self.mount_a.run_shell(['stat', '-c' '%a', subvol3_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode1, expected_mode1)
+        self.assertEqual(actual_mode2, expected_mode2)
+        self.assertEqual(actual_mode3, expected_mode2)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvol1, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvol2, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvol3, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_desired_uid_gid(self):
+        """
+        That the subvolume can be created with the desired uid and gid and its uid and gid matches the
+        expected values.
+        """
+        uid = 1000
+        gid = 1000
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--uid", str(uid), "--gid", str(gid))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # verify the uid and gid
+        suid = int(self.mount_a.run_shell(['stat', '-c' '%u', subvolpath]).stdout.getvalue().strip())
+        sgid = int(self.mount_a.run_shell(['stat', '-c' '%g', subvolpath]).stdout.getvalue().strip())
+        self.assertEqual(uid, suid)
+        self.assertEqual(gid, sgid)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_nonexistent_subvolume_group_rm(self):
+        group = "non_existent_group"
+
+        # try, remove subvolume group
+        try:
+            self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolumegroup rm' command to fail")
+
+    def test_default_uid_gid_subvolume_group(self):
+        group = self._generate_random_group_name()
+        expected_uid = 0
+        expected_gid = 0
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        group_path = self._get_subvolume_group_path(self.volname, group)
+
+        # check group's uid and gid
+        stat = self.mount_a.stat(group_path)
+        self.assertEqual(stat['st_uid'], expected_uid)
+        self.assertEqual(stat['st_gid'], expected_gid)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_ls(self):
+        # tests the 'fs subvolumegroup ls' command
+
+        subvolumegroups = []
+
+        #create subvolumegroups
+        subvolumegroups = self._generate_random_group_name(3)
+        for groupname in subvolumegroups:
+            self._fs_cmd("subvolumegroup", "create", self.volname, groupname)
+
+        subvolumegroupls = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname))
+        if len(subvolumegroupls) == 0:
+            raise RuntimeError("Expected the 'fs subvolumegroup ls' command to list the created subvolume groups")
+        else:
+            subvolgroupnames = [subvolumegroup['name'] for subvolumegroup in subvolumegroupls]
+            if collections.Counter(subvolgroupnames) != collections.Counter(subvolumegroups):
+                raise RuntimeError("Error creating or listing subvolume groups")
+
+    def test_subvolume_group_ls_for_nonexistent_volume(self):
+        # tests the 'fs subvolumegroup ls' command when /volume doesn't exist
+        # prerequisite: we expect that the test volume is created and a subvolumegroup is NOT created
+
+        # list subvolume groups
+        subvolumegroupls = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname))
+        if len(subvolumegroupls) > 0:
+            raise RuntimeError("Expected the 'fs subvolumegroup ls' command to output an empty list")
+
+    ### snapshot operations
+
+    def test_subvolume_snapshot_create_and_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_info(self):
+
+        """
+        tests the 'fs subvolume snapshot info' command
+        """
+
+        snap_md = ["created_at", "data_pool", "has_pending_clones", "size"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot, snap_missing = self._generate_random_snapshot_name(2)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=1)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot))
+        for md in snap_md:
+            self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md))
+        self.assertEqual(snap_info["has_pending_clones"], "no")
+
+        # snapshot info for non-existent snapshot
+        try:
+            self._get_subvolume_snapshot_info(self.volname, subvolume, snap_missing)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on snapshot info of non-existent snapshot")
+        else:
+            self.fail("expected snapshot info of non-existent snapshot to fail")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_create_idempotence(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # try creating w/ same subvolume snapshot name -- should be idempotent
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_nonexistent_subvolume_snapshot_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove snapshot again
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolume snapshot rm' command to fail")
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_rm_force(self):
+        # test removing non existing subvolume snapshot with --force
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # remove snapshot
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, "--force")
+        except CommandFailedError as ce:
+            raise RuntimeError("expected the 'fs subvolume snapshot rm --force' command to succeed")
+
+    def test_subvolume_snapshot_in_group(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # snapshot subvolume in group
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_snapshot_ls(self):
+        # tests the 'fs subvolume snapshot ls' command
+
+        snapshots = []
+
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # create subvolume snapshots
+        snapshots = self._generate_random_snapshot_name(3)
+        for snapshot in snapshots:
+            self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume))
+        if len(subvolsnapshotls) == 0:
+            self.fail("Expected the 'fs subvolume snapshot ls' command to list the created subvolume snapshots")
+        else:
+            snapshotnames = [snapshot['name'] for snapshot in subvolsnapshotls]
+            if collections.Counter(snapshotnames) != collections.Counter(snapshots):
+                self.fail("Error creating or listing subvolume snapshots")
+
+        # remove snapshot
+        for snapshot in snapshots:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_snapshot_unsupported_status(self):
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # snapshot group
+        try:
+            self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOSYS, "invalid error code on subvolumegroup snapshot create")
+        else:
+            self.fail("expected subvolumegroup snapshot create command to fail")
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_subvolume_group_snapshot_create_and_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # snapshot group
+        self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_subvolume_group_snapshot_idempotence(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # snapshot group
+        self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        # try creating snapshot w/ same snapshot name -- shoule be idempotent
+        self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_nonexistent_subvolume_group_snapshot_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # snapshot group
+        self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot)
+
+        # remove snapshot
+        try:
+            self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolumegroup snapshot rm' command to fail")
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_subvolume_group_snapshot_rm_force(self):
+        # test removing non-existing subvolume group snapshot with --force
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+        # remove snapshot
+        try:
+            self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot, "--force")
+        except CommandFailedError as ce:
+            raise RuntimeError("expected the 'fs subvolumegroup snapshot rm --force' command to succeed")
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_subvolume_group_snapshot_ls(self):
+        # tests the 'fs subvolumegroup snapshot ls' command
+
+        snapshots = []
+
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolumegroup snapshots
+        snapshots = self._generate_random_snapshot_name(3)
+        for snapshot in snapshots:
+            self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        subvolgrpsnapshotls = json.loads(self._fs_cmd('subvolumegroup', 'snapshot', 'ls', self.volname, group))
+        if len(subvolgrpsnapshotls) == 0:
+            raise RuntimeError("Expected the 'fs subvolumegroup snapshot ls' command to list the created subvolume group snapshots")
+        else:
+            snapshotnames = [snapshot['name'] for snapshot in subvolgrpsnapshotls]
+            if collections.Counter(snapshotnames) != collections.Counter(snapshots):
+                raise RuntimeError("Error creating or listing subvolume group snapshots")
+
+    def test_async_subvolume_rm(self):
+        subvolumes = self._generate_random_subvolume_name(100)
+
+        # create subvolumes
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume)
+            self._do_subvolume_io(subvolume, number_of_files=10)
+
+        self.mount_a.umount_wait()
+
+        # remove subvolumes
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        self.mount_a.mount()
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty(timeout=300)
+
+    def test_subvolume_inherited_snapshot_ls(self):
+        # tests the scenario where 'fs subvolume snapshot ls' command
+        # should not list inherited snapshots created as part of snapshot
+        # at ancestral level
+
+        snapshots = []
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snap_count = 3
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # create subvolume snapshots
+        snapshots = self._generate_random_snapshot_name(snap_count)
+        for snapshot in snapshots:
+            self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group)
+
+        # Create snapshot at ancestral level
+        ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", "ancestral_snap_1")
+        ancestral_snappath2 = os.path.join(".", "volumes", group, ".snap", "ancestral_snap_2")
+        self.mount_a.run_shell(['mkdir', '-p', ancestral_snappath1, ancestral_snappath2])
+
+        subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume, group))
+        self.assertEqual(len(subvolsnapshotls), snap_count)
+
+        # remove ancestral snapshots
+        self.mount_a.run_shell(['rmdir', ancestral_snappath1, ancestral_snappath2])
+
+        # remove snapshot
+        for snapshot in snapshots:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_inherited_snapshot_info(self):
+        """
+        tests the scenario where 'fs subvolume snapshot info' command
+        should fail for inherited snapshots created as part of snapshot
+        at ancestral level
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Create snapshot at ancestral level
+        ancestral_snap_name = "ancestral_snap_1"
+        ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", ancestral_snap_name)
+        self.mount_a.run_shell(['mkdir', '-p', ancestral_snappath1])
+
+        # Validate existence of inherited snapshot
+        group_path = os.path.join(".", "volumes", group)
+        inode_number_group_dir = int(self.mount_a.run_shell(['stat', '-c' '%i', group_path]).stdout.getvalue().strip())
+        inherited_snap = "_{0}_{1}".format(ancestral_snap_name, inode_number_group_dir)
+        inherited_snappath = os.path.join(".", "volumes", group, subvolume,".snap", inherited_snap)
+        self.mount_a.run_shell(['ls', inherited_snappath])
+
+        # snapshot info on inherited snapshot
+        try:
+            self._get_subvolume_snapshot_info(self.volname, subvolume, inherited_snap, group)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on snapshot info of inherited snapshot")
+        else:
+            self.fail("expected snapshot info of inherited snapshot to fail")
+
+        # remove ancestral snapshots
+        self.mount_a.run_shell(['rmdir', ancestral_snappath1])
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_inherited_snapshot_rm(self):
+        """
+        tests the scenario where 'fs subvolume snapshot rm' command
+        should fail for inherited snapshots created as part of snapshot
+        at ancestral level
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Create snapshot at ancestral level
+        ancestral_snap_name = "ancestral_snap_1"
+        ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", ancestral_snap_name)
+        self.mount_a.run_shell(['mkdir', '-p', ancestral_snappath1])
+
+        # Validate existence of inherited snap
+        group_path = os.path.join(".", "volumes", group)
+        inode_number_group_dir = int(self.mount_a.run_shell(['stat', '-c' '%i', group_path]).stdout.getvalue().strip())
+        inherited_snap = "_{0}_{1}".format(ancestral_snap_name, inode_number_group_dir)
+        inherited_snappath = os.path.join(".", "volumes", group, subvolume,".snap", inherited_snap)
+        self.mount_a.run_shell(['ls', inherited_snappath])
+
+        # inherited snapshot should not be deletable
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, inherited_snap, "--group_name", group)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, msg="invalid error code when removing inherited snapshot")
+        else:
+            self.fail("expected removing inheirted snapshot to fail")
+
+        # remove ancestral snapshots
+        self.mount_a.run_shell(['rmdir', ancestral_snappath1])
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_subvolumegroup_snapshot_name_conflict(self):
+        """
+        tests the scenario where creation of subvolume snapshot name
+        with same name as it's subvolumegroup snapshot name. This should
+        fail.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        group_snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Create subvolumegroup snapshot
+        group_snapshot_path = os.path.join(".", "volumes", group, ".snap", group_snapshot)
+        self.mount_a.run_shell(['mkdir', '-p', group_snapshot_path])
+
+        # Validate existence of subvolumegroup snapshot
+        self.mount_a.run_shell(['ls', group_snapshot_path])
+
+        # Creation of subvolume snapshot with it's subvolumegroup snapshot name should fail
+        try:
+            self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, group_snapshot, "--group_name", group)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, msg="invalid error code when creating subvolume snapshot with same name as subvolume group snapshot")
+        else:
+            self.fail("expected subvolume snapshot creation with same name as subvolumegroup snapshot to fail")
+
+        # remove subvolumegroup snapshot
+        self.mount_a.run_shell(['rmdir', group_snapshot_path])
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_upgrade_legacy_to_v1(self):
+        """
+        poor man's upgrade test -- rather than going through a full upgrade cycle,
+        emulate subvolumes by going through the wormhole and verify if they are
+        accessible.
+        further ensure that a legacy volume is not updated to v2.
+        """
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # emulate a old-fashioned subvolume -- one in the default group and
+        # the other in a custom group
+        createpath1 = os.path.join(".", "volumes", "_nogroup", subvolume1)
+        self.mount_a.run_shell(['mkdir', '-p', createpath1])
+
+        # create group
+        createpath2 = os.path.join(".", "volumes", group, subvolume2)
+        self.mount_a.run_shell(['mkdir', '-p', createpath2])
+
+        # this would auto-upgrade on access without anyone noticing
+        subvolpath1 = self._fs_cmd("subvolume", "getpath", self.volname, subvolume1)
+        self.assertNotEqual(subvolpath1, None)
+        subvolpath1 = subvolpath1.rstrip() # remove "/" prefix and any trailing newline
+
+        subvolpath2 = self._fs_cmd("subvolume", "getpath", self.volname, subvolume2, group)
+        self.assertNotEqual(subvolpath2, None)
+        subvolpath2 = subvolpath2.rstrip() # remove "/" prefix and any trailing newline
+
+        # and... the subvolume path returned should be what we created behind the scene
+        self.assertEqual(createpath1[1:], subvolpath1)
+        self.assertEqual(createpath2[1:], subvolpath2)
+
+        # ensure metadata file is in legacy location, with required version v1
+        self._assert_meta_location_and_version(self.volname, subvolume1, version=1, legacy=True)
+        self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=1, legacy=True)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_no_upgrade_v1_sanity(self):
+        """
+        poor man's upgrade test -- theme continues...
+
+        This test is to ensure v1 subvolumes are retained as is, due to a snapshot being present, and runs through
+        a series of operations on the v1 subvolume to ensure they work as expected.
+        """
+        subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime",
+                     "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace",
+                     "type", "uid", "features", "state"]
+        snap_md = ["created_at", "data_pool", "has_pending_clones", "size"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1, clone2 = self._generate_random_clone_name(2)
+        mode = "777"
+        uid  = "1000"
+        gid  = "1000"
+
+        # emulate a v1 subvolume -- in the default group
+        subvolume_path = self._create_v1_subvolume(subvolume)
+
+        # getpath
+        subvolpath = self._get_subvolume_path(self.volname, subvolume)
+        self.assertEqual(subvolpath, subvolume_path)
+
+        # ls
+        subvolumes = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumes), 1, "subvolume ls count mismatch, expected '1', found {0}".format(len(subvolumes)))
+        self.assertEqual(subvolumes[0]['name'], subvolume,
+                         "subvolume name mismatch in ls output, expected '{0}', found '{1}'".format(subvolume, subvolumes[0]['name']))
+
+        # info
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        for md in subvol_md:
+            self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md))
+
+        self.assertEqual(subvol_info["state"], "complete",
+                         msg="expected state to be 'complete', found '{0}".format(subvol_info["state"]))
+        self.assertEqual(len(subvol_info["features"]), 2,
+                         msg="expected 1 feature, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"]))
+        for feature in ['snapshot-clone', 'snapshot-autoprotect']:
+            self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
+
+        # resize
+        nsize = self.DEFAULT_FILE_SIZE*1024*1024*10
+        self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize))
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        for md in subvol_md:
+            self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md))
+        self.assertEqual(subvol_info["bytes_quota"], nsize, "bytes_quota should be set to '{0}'".format(nsize))
+
+        # create (idempotent) (change some attrs, to ensure attrs are preserved from the snapshot on clone)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", mode, "--uid", uid, "--gid", gid)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=8)
+
+        # snap-create
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone1)
+
+        # ensure clone is v2
+        self._assert_meta_location_and_version(self.volname, clone1, version=2)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone1, source_version=1)
+
+        # clone (older snapshot)
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, 'fake', clone2)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone2)
+
+        # ensure clone is v2
+        self._assert_meta_location_and_version(self.volname, clone2, version=2)
+
+        # verify clone
+        # TODO: rentries will mismatch till this is fixed https://tracker.ceph.com/issues/46747
+        #self._verify_clone(subvolume, 'fake', clone2, source_version=1)
+
+        # snap-info
+        snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot))
+        for md in snap_md:
+            self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md))
+        self.assertEqual(snap_info["has_pending_clones"], "no")
+
+        # snap-ls
+        subvol_snapshots = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume))
+        self.assertEqual(len(subvol_snapshots), 2, "subvolume ls count mismatch, expected 2', found {0}".format(len(subvol_snapshots)))
+        snapshotnames = [snapshot['name'] for snapshot in subvol_snapshots]
+        for name in [snapshot, 'fake']:
+            self.assertIn(name, snapshotnames, msg="expected snapshot '{0}' in subvolume snapshot ls".format(name))
+
+        # snap-rm
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, "fake")
+
+        # ensure volume is still at version 1
+        self._assert_meta_location_and_version(self.volname, subvolume, version=1)
+
+        # rm
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone1)
+        self._fs_cmd("subvolume", "rm", self.volname, clone2)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_no_upgrade_v1_to_v2(self):
+        """
+        poor man's upgrade test -- theme continues...
+        ensure v1 to v2 upgrades are not done automatically due to various states of v1
+        """
+        subvolume1, subvolume2, subvolume3 = self._generate_random_subvolume_name(3)
+        group = self._generate_random_group_name()
+
+        # emulate a v1 subvolume -- in the default group
+        subvol1_path = self._create_v1_subvolume(subvolume1)
+
+        # emulate a v1 subvolume -- in a custom group
+        subvol2_path = self._create_v1_subvolume(subvolume2, subvol_group=group)
+
+        # emulate a v1 subvolume -- in a clone pending state
+        self._create_v1_subvolume(subvolume3, subvol_type='clone', has_snapshot=False, state='pending')
+
+        # this would attempt auto-upgrade on access, but fail to do so as snapshots exist
+        subvolpath1 = self._get_subvolume_path(self.volname, subvolume1)
+        self.assertEqual(subvolpath1, subvol1_path)
+
+        subvolpath2 = self._get_subvolume_path(self.volname, subvolume2, group_name=group)
+        self.assertEqual(subvolpath2, subvol2_path)
+
+        # this would attempt auto-upgrade on access, but fail to do so as volume is not complete
+        # use clone status, as only certain operations are allowed in pending state
+        status = json.loads(self._fs_cmd("clone", "status", self.volname, subvolume3))
+        self.assertEqual(status["status"]["state"], "pending")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume1, "fake")
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume2, "fake", group)
+
+        # ensure metadata file is in v1 location, with version retained as v1
+        self._assert_meta_location_and_version(self.volname, subvolume1, version=1)
+        self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=1)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume3)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on rm of subvolume undergoing clone")
+        else:
+            self.fail("expected rm of subvolume undergoing clone to fail")
+
+        # ensure metadata file is in v1 location, with version retained as v1
+        self._assert_meta_location_and_version(self.volname, subvolume3, version=1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume3, "--force")
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_upgrade_v1_to_v2(self):
+        """
+        poor man's upgrade test -- theme continues...
+        ensure v1 to v2 upgrades work
+        """
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # emulate a v1 subvolume -- in the default group
+        subvol1_path = self._create_v1_subvolume(subvolume1, has_snapshot=False)
+
+        # emulate a v1 subvolume -- in a custom group
+        subvol2_path = self._create_v1_subvolume(subvolume2, subvol_group=group, has_snapshot=False)
+
+        # this would attempt auto-upgrade on access
+        subvolpath1 = self._get_subvolume_path(self.volname, subvolume1)
+        self.assertEqual(subvolpath1, subvol1_path)
+
+        subvolpath2 = self._get_subvolume_path(self.volname, subvolume2, group_name=group)
+        self.assertEqual(subvolpath2, subvol2_path)
+
+        # ensure metadata file is in v2 location, with version retained as v2
+        self._assert_meta_location_and_version(self.volname, subvolume1, version=2)
+        self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=2)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_rm_with_snapshots(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove subvolume -- should fail with ENOTEMPTY since it has snapshots
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOTEMPTY:
+                raise RuntimeError("invalid error code returned when deleting subvolume with snapshots")
+        else:
+            raise RuntimeError("expected subvolume deletion to fail")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_without_snapshots(self):
+        """
+        ensure retain snapshots based delete of a subvolume with no snapshots, deletes the subbvolume
+        """
+        subvolume = self._generate_random_subvolume_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # remove with snapshot retention (should remove volume, no snapshots to retain)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_with_snapshots(self):
+        """
+        ensure retain snapshots based delete of a subvolume with snapshots retains the subvolume
+        also test allowed and dis-allowed operations on a retained subvolume
+        """
+        snap_md = ["created_at", "data_pool", "has_pending_clones", "size"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove subvolume -- should fail with ENOTEMPTY since it has snapshots
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of retained subvolume with snapshots")
+        else:
+            self.fail("expected rm of subvolume with retained snapshots to fail")
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # fetch info
+        subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume))
+        self.assertEqual(subvol_info["state"], "snapshot-retained",
+                         msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"]))
+
+        ## test allowed ops in retained state
+        # ls
+        subvolumes = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumes), 1, "subvolume ls count mismatch, expected '1', found {0}".format(len(subvolumes)))
+        self.assertEqual(subvolumes[0]['name'], subvolume,
+                         "subvolume name mismatch in ls output, expected '{0}', found '{1}'".format(subvolume, subvolumes[0]['name']))
+
+        # snapshot info
+        snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot))
+        for md in snap_md:
+            self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md))
+        self.assertEqual(snap_info["has_pending_clones"], "no")
+
+        # rm --force (allowed but should fail)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--force")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of subvolume with retained snapshots")
+        else:
+            self.fail("expected rm of subvolume with retained snapshots to fail")
+
+        # rm (allowed but should fail)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of subvolume with retained snapshots")
+        else:
+            self.fail("expected rm of subvolume with retained snapshots to fail")
+
+        ## test disallowed ops
+        # getpath
+        try:
+            self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of subvolume with retained snapshots")
+        else:
+            self.fail("expected getpath of subvolume with retained snapshots to fail")
+
+        # resize
+        nsize = self.DEFAULT_FILE_SIZE*1024*1024
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize))
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on resize of subvolume with retained snapshots")
+        else:
+            self.fail("expected resize of subvolume with retained snapshots to fail")
+
+        # snap-create
+        try:
+            self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, "fail")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on snapshot create of subvolume with retained snapshots")
+        else:
+            self.fail("expected snapshot create of subvolume with retained snapshots to fail")
+
+        # remove snapshot (should remove volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_invalid_recreate(self):
+        """
+        ensure retained subvolume recreate does not leave any incarnations in the subvolume and trash
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # recreate subvolume with an invalid pool
+        data_pool = "invalid_pool"
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on recreate of subvolume with invalid poolname")
+        else:
+            self.fail("expected recreate of subvolume with invalid poolname to fail")
+
+        # fetch info
+        subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume))
+        self.assertEqual(subvol_info["state"], "snapshot-retained",
+                         msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"]))
+
+        # getpath
+        try:
+            self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of subvolume with retained snapshots")
+        else:
+            self.fail("expected getpath of subvolume with retained snapshots to fail")
+
+        # remove snapshot (should remove volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_trash_busy_recreate(self):
+        """
+        ensure retained subvolume recreate fails if its trash is not yet purged
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # fake a trash entry
+        self._update_fake_trash(subvolume)
+
+        # recreate subvolume
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on recreate of subvolume with purge pending")
+        else:
+            self.fail("expected recreate of subvolume with purge pending to fail")
+
+        # clear fake trash entry
+        self._update_fake_trash(subvolume, create=False)
+
+        # recreate subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_trash_busy_recreate_clone(self):
+        """
+        ensure retained clone recreate fails if its trash is not yet purged
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # clone subvolume snapshot
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # snapshot clone
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone, snapshot)
+
+        # remove clone with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, clone, "--retain-snapshots")
+
+        # fake a trash entry
+        self._update_fake_trash(clone)
+
+        # clone subvolume snapshot (recreate)
+        try:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on recreate of clone with purge pending")
+        else:
+            self.fail("expected recreate of clone with purge pending to fail")
+
+        # clear fake trash entry
+        self._update_fake_trash(clone, create=False)
+
+        # recreate subvolume
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_recreate_subvolume(self):
+        """
+        ensure a retained subvolume can be recreated and further snapshotted
+        """
+        snap_md = ["created_at", "data_pool", "has_pending_clones", "size"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot1, snapshot2 = self._generate_random_snapshot_name(2)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # fetch info
+        subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume))
+        self.assertEqual(subvol_info["state"], "snapshot-retained",
+                         msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"]))
+
+        # recreate retained subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # fetch info
+        subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume))
+        self.assertEqual(subvol_info["state"], "complete",
+                         msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"]))
+
+        # snapshot info (older snapshot)
+        snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot1))
+        for md in snap_md:
+            self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md))
+        self.assertEqual(snap_info["has_pending_clones"], "no")
+
+        # snap-create (new snapshot)
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot2)
+
+        # remove with retain snapshots
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # list snapshots
+        subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume))
+        self.assertEqual(len(subvolsnapshotls), 2, "Expected the 'fs subvolume snapshot ls' command to list the"
+                         " created subvolume snapshots")
+        snapshotnames = [snapshot['name'] for snapshot in subvolsnapshotls]
+        for snap in [snapshot1, snapshot2]:
+            self.assertIn(snap, snapshotnames, "Missing snapshot '{0}' in snapshot list".format(snap))
+
+        # remove snapshots (should remove volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot2)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_clone(self):
+        """
+        clone a snapshot from a snapshot retained subvolume
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # store path for clone verification
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # clone retained subvolume snapshot
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, subvol_path=subvol_path)
+
+        # remove snapshots (removes retained volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_recreate(self):
+        """
+        recreate a subvolume from one of its retained snapshots
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # store path for clone verification
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # recreate retained subvolume using its own snapshot to clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, subvolume)
+
+        # check clone status
+        self._wait_for_clone_to_complete(subvolume)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, subvolume, subvol_path=subvol_path)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_retain_snapshot_with_snapshots(self):
+        """
+        retain snapshots of a cloned subvolume and check disallowed operations
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot1, snapshot2 = self._generate_random_snapshot_name(2)
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # store path for clone verification
+        subvol1_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # clone retained subvolume snapshot
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot1, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot1, clone, subvol_path=subvol1_path)
+
+        # create a snapshot on the clone
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone, snapshot2)
+
+        # retain a clone
+        self._fs_cmd("subvolume", "rm", self.volname, clone, "--retain-snapshots")
+
+        # list snapshots
+        clonesnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, clone))
+        self.assertEqual(len(clonesnapshotls), 1, "Expected the 'fs subvolume snapshot ls' command to list the"
+                         " created subvolume snapshots")
+        snapshotnames = [snapshot['name'] for snapshot in clonesnapshotls]
+        for snap in [snapshot2]:
+            self.assertIn(snap, snapshotnames, "Missing snapshot '{0}' in snapshot list".format(snap))
+
+        ## check disallowed operations on retained clone
+        # clone-status
+        try:
+            self._fs_cmd("clone", "status", self.volname, clone)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on clone status of clone with retained snapshots")
+        else:
+            self.fail("expected clone status of clone with retained snapshots to fail")
+
+        # clone-cancel
+        try:
+            self._fs_cmd("clone", "cancel", self.volname, clone)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on clone cancel of clone with retained snapshots")
+        else:
+            self.fail("expected clone cancel of clone with retained snapshots to fail")
+
+        # remove snapshots (removes subvolumes as all are in retained state)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone, snapshot2)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_clone_from_newer_snapshot(self):
+        """
+        clone a subvolume from recreated subvolume's latest snapshot
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot1, snapshot2 = self._generate_random_snapshot_name(2)
+        clone = self._generate_random_clone_name(1)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # recreate subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # get and store path for clone verification
+        subvol2_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot newer subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot2)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # clone retained subvolume's newer snapshot
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot2, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot2, clone, subvol_path=subvol2_path)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot2)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_protect_unprotect_sanity(self):
+        """
+        Snapshot protect/unprotect commands are deprecated. This test exists to ensure that
+        invoking the command does not cause errors, till they are removed from a subsequent release.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # now, protect snapshot
+        self._fs_cmd("subvolume", "snapshot", "protect", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # now, unprotect snapshot
+        self._fs_cmd("subvolume", "snapshot", "unprotect", self.volname, subvolume, snapshot)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_reconf_max_concurrent_clones(self):
+        """
+        Validate 'max_concurrent_clones' config option
+        """
+
+        # get the default number of cloner threads
+        default_max_concurrent_clones = int(self.config_get('mgr.x', 'mgr/volumes/max_concurrent_clones'))
+        self.assertEqual(default_max_concurrent_clones, 4)
+
+        # Increase number of cloner threads
+        self.config_set('mgr', 'mgr/volumes/max_concurrent_clones', 6)
+        max_concurrent_clones = int(self.config_get('mgr.x', 'mgr/volumes/max_concurrent_clones'))
+        self.assertEqual(max_concurrent_clones, 6)
+
+        # Decrease number of cloner threads
+        self.config_set('mgr', 'mgr/volumes/max_concurrent_clones', 2)
+        max_concurrent_clones = int(self.config_get('mgr.x', 'mgr/volumes/max_concurrent_clones'))
+        self.assertEqual(max_concurrent_clones, 2)
+
+    def test_subvolume_snapshot_clone_pool_layout(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # add data pool
+        new_pool = "new_pool"
+        self.fs.add_data_pool(new_pool)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, "--pool_layout", new_pool)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, clone_pool=new_pool)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        subvol_path = self._get_subvolume_path(self.volname, clone)
+        desired_pool = self.mount_a.getfattr(subvol_path, "ceph.dir.layout.pool")
+        self.assertEqual(desired_pool, new_pool)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_with_attrs(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        mode = "777"
+        uid  = "1000"
+        gid  = "1000"
+        new_uid  = "1001"
+        new_gid  = "1001"
+        new_mode = "700"
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", mode, "--uid", uid, "--gid", gid)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # change subvolume attrs (to ensure clone picks up snapshot attrs)
+        self._do_subvolume_attr_update(subvolume, new_uid, new_gid, new_mode)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_inherit_snapshot_namespace_and_size(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*12
+
+        # create subvolume, in an isolated namespace with a specified size
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--namespace-isolated", "--size", str(osize))
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=8)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # create a pool different from current subvolume pool
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+        default_pool = self.mount_a.getfattr(subvol_path, "ceph.dir.layout.pool")
+        new_pool = "new_pool"
+        self.assertNotEqual(default_pool, new_pool)
+        self.fs.add_data_pool(new_pool)
+
+        # update source subvolume pool
+        self._do_subvolume_pool_and_namespace_update(subvolume, pool=new_pool, pool_namespace="")
+
+        # schedule a clone, with NO --pool specification
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_and_reclone(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1, clone2 = self._generate_random_clone_name(2)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone1)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone1)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # now the clone is just like a normal subvolume -- snapshot the clone and fork
+        # another clone. before that do some IO so it's can be differentiated.
+        self._do_subvolume_io(clone1, create_dir="data", number_of_files=32)
+
+        # snapshot clone -- use same snap name
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone1, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, clone1, snapshot, clone2)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone2)
+
+        # verify clone
+        self._verify_clone(clone1, snapshot, clone2)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone1, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone1)
+        self._fs_cmd("subvolume", "rm", self.volname, clone2)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_under_group(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+        group = self._generate_random_group_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, '--target_group_name', group)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone, clone_group=group)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, clone_group=group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone, group)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_under_group_snapshot_clone(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, group)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, subvolume_group=group, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, '--group_name', group)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, source_group=group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_different_groups(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+        s_group, c_group = self._generate_random_group_name(2)
+
+        # create groups
+        self._fs_cmd("subvolumegroup", "create", self.volname, s_group)
+        self._fs_cmd("subvolumegroup", "create", self.volname, c_group)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, s_group)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, subvolume_group=s_group, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, s_group)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone,
+                     '--group_name', s_group, '--target_group_name', c_group)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone, clone_group=c_group)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, source_group=s_group, clone_group=c_group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, s_group)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, s_group)
+        self._fs_cmd("subvolume", "rm", self.volname, clone, c_group)
+
+        # remove groups
+        self._fs_cmd("subvolumegroup", "rm", self.volname, s_group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, c_group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_with_upgrade(self):
+        """
+        yet another poor man's upgrade test -- rather than going through a full
+        upgrade cycle, emulate old types subvolumes by going through the wormhole
+        and verify clone operation.
+        further ensure that a legacy volume is not updated to v2, but clone is.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # emulate a old-fashioned subvolume
+        createpath = os.path.join(".", "volumes", "_nogroup", subvolume)
+        self.mount_a.run_shell(['mkdir', '-p', createpath])
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # ensure metadata file is in legacy location, with required version v1
+        self._assert_meta_location_and_version(self.volname, subvolume, version=1, legacy=True)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # snapshot should not be deletable now
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, msg="invalid error code when removing source snapshot of a clone")
+        else:
+            self.fail("expected removing source snapshot of a clone to fail")
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, source_version=1)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # ensure metadata file is in v2 location, with required version v2
+        self._assert_meta_location_and_version(self.volname, clone)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_in_progress_getpath(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # clone should not be accessible right now
+        try:
+            self._get_subvolume_path(self.volname, clone)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EAGAIN:
+                raise RuntimeError("invalid error code when fetching path of an pending clone")
+        else:
+            raise RuntimeError("expected fetching path of an pending clone to fail")
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # clone should be accessible now
+        subvolpath = self._get_subvolume_path(self.volname, clone)
+        self.assertNotEqual(subvolpath, None)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_in_progress_snapshot_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # snapshot should not be deletable now
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, msg="invalid error code when removing source snapshot of a clone")
+        else:
+            self.fail("expected removing source snapshot of a clone to fail")
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # clone should be accessible now
+        subvolpath = self._get_subvolume_path(self.volname, clone)
+        self.assertNotEqual(subvolpath, None)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_in_progress_source(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # verify clone source
+        result = json.loads(self._fs_cmd("clone", "status", self.volname, clone))
+        source = result['status']['source']
+        self.assertEqual(source['volume'], self.volname)
+        self.assertEqual(source['subvolume'], subvolume)
+        self.assertEqual(source.get('group', None), None)
+        self.assertEqual(source['snapshot'], snapshot)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # clone should be accessible now
+        subvolpath = self._get_subvolume_path(self.volname, clone)
+        self.assertNotEqual(subvolpath, None)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_non_clone_status(self):
+        subvolume = self._generate_random_subvolume_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        try:
+            self._fs_cmd("clone", "status", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOTSUP:
+                raise RuntimeError("invalid error code when fetching status of a non cloned subvolume")
+        else:
+            raise RuntimeError("expected fetching of clone status of a subvolume to fail")
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_on_existing_subvolumes(self):
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolumes
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume2)
+
+        # do some IO
+        self._do_subvolume_io(subvolume1, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume1, snapshot)
+
+        # schedule a clone with target as subvolume2
+        try:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, subvolume2)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EEXIST:
+                raise RuntimeError("invalid error code when cloning to existing subvolume")
+        else:
+            raise RuntimeError("expected cloning to fail if the target is an existing subvolume")
+
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, clone)
+
+        # schedule a clone with target as clone
+        try:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, clone)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EEXIST:
+                raise RuntimeError("invalid error code when cloning to existing clone")
+        else:
+            raise RuntimeError("expected cloning to fail if the target is an existing clone")
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume1, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume1, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_fail_with_remove(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1, clone2 = self._generate_random_clone_name(2)
+
+        pool_capacity = 32 * 1024 * 1024
+        # number of files required to fill up 99% of the pool
+        nr_files = int((pool_capacity * 0.99) // (TestVolumes.DEFAULT_FILE_SIZE * 1024 * 1024))
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=nr_files)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # add data pool
+        new_pool = "new_pool"
+        self.fs.add_data_pool(new_pool)
+
+        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", new_pool,
+                                            "max_bytes", "{0}".format(pool_capacity // 4))
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1, "--pool_layout", new_pool)
+
+        # check clone status -- this should dramatically overshoot the pool quota
+        self._wait_for_clone_to_complete(clone1)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone1, clone_pool=new_pool)
+
+        # wait a bit so that subsequent I/O will give pool full error
+        time.sleep(120)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone2, "--pool_layout", new_pool)
+
+        # check clone status
+        self._wait_for_clone_to_fail(clone2)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone1)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, clone2)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EAGAIN:
+                raise RuntimeError("invalid error code when trying to remove failed clone")
+        else:
+            raise RuntimeError("expected error when removing a failed clone")
+
+        #  ... and with force, failed clone can be removed
+        self._fs_cmd("subvolume", "rm", self.volname, clone2, "--force")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_attr_clone(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io_mixed(subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_cancel_in_progress(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=128)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # cancel on-going clone
+        self._fs_cmd("clone", "cancel", self.volname, clone)
+
+        # verify canceled state
+        self._check_clone_canceled(clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone, "--force")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_cancel_pending(self):
+        """
+        this test is a bit more involved compared to canceling an in-progress clone.
+        we'd need to ensure that a to-be canceled clone has still not been picked up
+        by cloner threads. exploit the fact that clones are picked up in an FCFS
+        fashion and there are four (4) cloner threads by default. When the number of
+        cloner threads increase, this test _may_ start tripping -- so, the number of
+        clone operations would need to be jacked up.
+        """
+        # default number of clone threads
+        NR_THREADS = 4
+        # good enough for 4 threads
+        NR_CLONES = 5
+        # yeh, 1gig -- we need the clone to run for sometime
+        FILE_SIZE_MB = 1024
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clones = self._generate_random_clone_name(NR_CLONES)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=4, file_size=FILE_SIZE_MB)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule clones
+        for clone in clones:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        to_wait = clones[0:NR_THREADS]
+        to_cancel = clones[NR_THREADS:]
+
+        # cancel pending clones and verify
+        for clone in to_cancel:
+            status = json.loads(self._fs_cmd("clone", "status", self.volname, clone))
+            self.assertEqual(status["status"]["state"], "pending")
+            self._fs_cmd("clone", "cancel", self.volname, clone)
+            self._check_clone_canceled(clone)
+
+        # let's cancel on-going clones. handle the case where some of the clones
+        # _just_ complete
+        for clone in list(to_wait):
+            try:
+                self._fs_cmd("clone", "cancel", self.volname, clone)
+                to_cancel.append(clone)
+                to_wait.remove(clone)
+            except CommandFailedError as ce:
+                if ce.exitstatus != errno.EINVAL:
+                    raise RuntimeError("invalid error code when cancelling on-going clone")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        for clone in to_wait:
+            self._fs_cmd("subvolume", "rm", self.volname, clone)
+        for clone in to_cancel:
+            self._fs_cmd("subvolume", "rm", self.volname, clone, "--force")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
diff --git a/qa/tasks/cephfs_test_runner.py b/qa/tasks/cephfs_test_runner.py
new file mode 100644
index 00000000..4455c086
--- /dev/null
+++ b/qa/tasks/cephfs_test_runner.py
@@ -0,0 +1,209 @@
+import contextlib
+import logging
+import os
+import unittest
+from unittest import suite, loader, case
+from teuthology.task import interactive
+from teuthology import misc
+from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
+from tasks.mgr.mgr_test_case import MgrCluster
+
+log = logging.getLogger(__name__)
+
+
+class DecoratingLoader(loader.TestLoader):
+    """
+    A specialization of TestLoader that tags some extra attributes
+    onto test classes as they are loaded.
+    """
+    def __init__(self, params):
+        self._params = params
+        super(DecoratingLoader, self).__init__()
+
+    def _apply_params(self, obj):
+        for k, v in self._params.items():
+            setattr(obj, k, v)
+
+    def loadTestsFromTestCase(self, testCaseClass):
+        self._apply_params(testCaseClass)
+        return super(DecoratingLoader, self).loadTestsFromTestCase(testCaseClass)
+
+    def loadTestsFromName(self, name, module=None):
+        result = super(DecoratingLoader, self).loadTestsFromName(name, module)
+
+        # Special case for when we were called with the name of a method, we get
+        # a suite with one TestCase
+        tests_in_result = list(result)
+        if len(tests_in_result) == 1 and isinstance(tests_in_result[0], case.TestCase):
+            self._apply_params(tests_in_result[0])
+
+        return result
+
+
+class LogStream(object):
+    def __init__(self):
+        self.buffer = ""
+
+    def write(self, data):
+        self.buffer += data
+        if "\n" in self.buffer:
+            lines = self.buffer.split("\n")
+            for line in lines[:-1]:
+                log.info(line)
+            self.buffer = lines[-1]
+
+    def flush(self):
+        pass
+
+
+class InteractiveFailureResult(unittest.TextTestResult):
+    """
+    Specialization that implements interactive-on-error style
+    behavior.
+    """
+    ctx = None
+
+    def addFailure(self, test, err):
+        log.error(self._exc_info_to_string(err, test))
+        log.error("Failure in test '{0}', going interactive".format(
+            self.getDescription(test)
+        ))
+        interactive.task(ctx=self.ctx, config=None)
+
+    def addError(self, test, err):
+        log.error(self._exc_info_to_string(err, test))
+        log.error("Error in test '{0}', going interactive".format(
+            self.getDescription(test)
+        ))
+        interactive.task(ctx=self.ctx, config=None)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the CephFS test cases.
+
+    Run everything in tasks/cephfs/test_*.py:
+
+    ::
+
+        tasks:
+          - install:
+          - ceph:
+          - ceph-fuse:
+          - cephfs_test_runner:
+
+    `modules` argument allows running only some specific modules:
+
+    ::
+
+        tasks:
+            ...
+          - cephfs_test_runner:
+              modules:
+                - tasks.cephfs.test_sessionmap
+                - tasks.cephfs.test_auto_repair
+
+    By default, any cases that can't be run on the current cluster configuration
+    will generate a failure.  When the optional `fail_on_skip` argument is set
+    to false, any tests that can't be run on the current configuration will
+    simply be skipped:
+
+    ::
+        tasks:
+            ...
+         - cephfs_test_runner:
+           fail_on_skip: false
+
+    """
+
+    ceph_cluster = CephCluster(ctx)
+
+    if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
+        mds_cluster = MDSCluster(ctx)
+        fs = Filesystem(ctx)
+    else:
+        mds_cluster = None
+        fs = None
+
+    if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
+        mgr_cluster = MgrCluster(ctx)
+    else:
+        mgr_cluster = None
+
+    # Mount objects, sorted by ID
+    if hasattr(ctx, 'mounts'):
+        mounts = [v for k, v in sorted(ctx.mounts.items(), key=lambda mount: mount[0])]
+    else:
+        # The test configuration has a filesystem but no fuse/kclient mounts
+        mounts = []
+
+    decorating_loader = DecoratingLoader({
+        "ctx": ctx,
+        "mounts": mounts,
+        "fs": fs,
+        "ceph_cluster": ceph_cluster,
+        "mds_cluster": mds_cluster,
+        "mgr_cluster": mgr_cluster,
+    })
+
+    fail_on_skip = config.get('fail_on_skip', True)
+
+    # Put useful things onto ctx for interactive debugging
+    ctx.fs = fs
+    ctx.mds_cluster = mds_cluster
+    ctx.mgr_cluster = mgr_cluster
+
+    # Depending on config, either load specific modules, or scan for moduless
+    if config and 'modules' in config and config['modules']:
+        module_suites = []
+        for mod_name in config['modules']:
+            # Test names like cephfs.test_auto_repair
+            module_suites.append(decorating_loader.loadTestsFromName(mod_name))
+        overall_suite = suite.TestSuite(module_suites)
+    else:
+        # Default, run all tests
+        overall_suite = decorating_loader.discover(
+            os.path.join(
+                os.path.dirname(os.path.abspath(__file__)),
+                "cephfs/"
+            )
+        )
+
+    if ctx.config.get("interactive-on-error", False):
+        InteractiveFailureResult.ctx = ctx
+        result_class = InteractiveFailureResult
+    else:
+        result_class = unittest.TextTestResult
+
+    class LoggingResult(result_class):
+        def startTest(self, test):
+            log.info("Starting test: {0}".format(self.getDescription(test)))
+            return super(LoggingResult, self).startTest(test)
+
+        def addSkip(self, test, reason):
+            if fail_on_skip:
+                # Don't just call addFailure because that requires a traceback
+                self.failures.append((test, reason))
+            else:
+                super(LoggingResult, self).addSkip(test, reason)
+
+    # Execute!
+    result = unittest.TextTestRunner(
+        stream=LogStream(),
+        resultclass=LoggingResult,
+        verbosity=2,
+        failfast=True).run(overall_suite)
+
+    if not result.wasSuccessful():
+        result.printErrors()  # duplicate output at end for convenience
+
+        bad_tests = []
+        for test, error in result.errors:
+            bad_tests.append(str(test))
+        for test, failure in result.failures:
+            bad_tests.append(str(test))
+
+        raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))
+
+    yield
diff --git a/qa/tasks/cephfs_upgrade_snap.py b/qa/tasks/cephfs_upgrade_snap.py
new file mode 100644
index 00000000..1708d43c
--- /dev/null
+++ b/qa/tasks/cephfs_upgrade_snap.py
@@ -0,0 +1,45 @@
+"""
+Upgrade cluster snap format.
+"""
+
+import logging
+import time
+
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Upgrade CephFS file system snap format.
+    """
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'snap-upgrade task only accepts a dict for configuration'
+
+    fs = Filesystem(ctx)
+
+    mds_map = fs.get_mds_map()
+    assert(mds_map['max_mds'] == 1)
+
+    json = fs.rank_tell(["scrub", "start", "/", "force", "recursive", "repair"])
+    if not json or json['return_code'] == 0:
+        log.info("scrub / completed")
+    else:
+        log.info("scrub / failed: {}".format(json))
+
+    json = fs.rank_tell(["scrub", "start", "~mdsdir", "force", "recursive", "repair"])
+    if not json or json['return_code'] == 0:
+        log.info("scrub ~mdsdir completed")
+    else:
+        log.info("scrub / failed: {}".format(json))
+
+    for i in range(0, 10):
+        mds_map = fs.get_mds_map()
+        if (mds_map['flags'] & (1<<1)) != 0 and (mds_map['flags'] & (1<<4)) != 0:
+            break
+        time.sleep(10)
+    assert((mds_map['flags'] & (1<<1)) != 0) # Test CEPH_MDSMAP_ALLOW_SNAPS
+    assert((mds_map['flags'] & (1<<4)) != 0) # Test CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS