226 files changed, 71052 insertions, 0 deletions
diff --git a/qa/tasks/__init__.py b/qa/tasks/__init__.py
new file mode 100644
index 000000000..9a7949a00
--- /dev/null
+++ b/qa/tasks/__init__.py
@@ -0,0 +1,6 @@
+import logging
+
+# Inherit teuthology's log level
+teuthology_log = logging.getLogger('teuthology')
+log = logging.getLogger(__name__)
+log.setLevel(teuthology_log.level)
diff --git a/qa/tasks/admin_socket.py b/qa/tasks/admin_socket.py
new file mode 100644
index 000000000..c454d3d0c
--- /dev/null
+++ b/qa/tasks/admin_socket.py
@@ -0,0 +1,194 @@
+"""
+Admin Socket task -- used in rados, powercycle, and smoke testing
+"""
+
+import json
+import logging
+import os
+import time
+
+from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+from teuthology.config import config as teuth_config
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Run an admin socket command, make sure the output is json, and run
+    a test program on it. The test program should read json from
+    stdin. This task succeeds if the test program exits with status 0.
+
+    To run the same test on all clients::
+
+        tasks:
+        - ceph:
+        - rados:
+        - admin_socket:
+            all:
+              dump_requests:
+                test: http://example.com/script
+
+    To restrict it to certain clients::
+
+        tasks:
+        - ceph:
+        - rados: [client.1]
+        - admin_socket:
+            client.1:
+              dump_requests:
+                test: http://example.com/script
+
+    If an admin socket command has arguments, they can be specified as
+    a list::
+
+        tasks:
+        - ceph:
+        - rados: [client.0]
+        - admin_socket:
+            client.0:
+              dump_requests:
+                test: http://example.com/script
+              help:
+                test: http://example.com/test_help_version
+                args: [version]
+
+    Note that there must be a ceph client with an admin socket running
+    before this task is run. The tests are parallelized at the client
+    level. Tests for a single client are run serially.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert isinstance(config, dict), \
+        'admin_socket task requires a dict for configuration'
+    teuthology.replace_all_with_clients(ctx.cluster, config)
+
+    with parallel() as ptask:
+        for client, tests in config.items():
+            ptask.spawn(_run_tests, ctx, client, tests)
+
+
+def _socket_command(ctx, remote, socket_path, command, args):
+    """
+    Run an admin socket command and return the result as a string.
+
+    :param ctx: Context
+    :param remote: Remote site
+    :param socket_path: path to socket
+    :param command: command to be run remotely
+    :param args: command arguments
+
+    :returns: output of command in json format
+    """
+    testdir = teuthology.get_testdir(ctx)
+    max_tries = 120
+    while True:
+        try:
+            out = remote.sh([
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=testdir),
+                'ceph',
+                '--admin-daemon', socket_path,
+                ] + command.split(' ') + args)
+        except CommandFailedError:
+            assert max_tries > 0
+            max_tries -= 1
+            log.info('ceph cli returned an error, command not registered yet?')
+            log.info('sleeping and retrying ...')
+            time.sleep(1)
+            continue
+        break
+    log.debug('admin socket command %s returned %s', command, out)
+    return json.loads(out)
+
+def _run_tests(ctx, client, tests):
+    """
+    Create a temp directory and wait for a client socket to be created.
+    For each test, copy the executable locally and run the test.
+    Remove temp directory when finished.
+
+    :param ctx: Context
+    :param client: client machine to run the test
+    :param tests: list of tests to run
+    """
+    testdir = teuthology.get_testdir(ctx)
+    log.debug('Running admin socket tests on %s', client)
+    (remote,) = ctx.cluster.only(client).remotes.keys()
+    socket_path = '/var/run/ceph/ceph-{name}.asok'.format(name=client)
+    overrides = ctx.config.get('overrides', {}).get('admin_socket', {})
+
+    try:
+        tmp_dir = os.path.join(
+            testdir,
+            'admin_socket_{client}'.format(client=client),
+            )
+        remote.run(
+            args=[
+                'mkdir',
+                '--',
+                tmp_dir,
+                run.Raw('&&'),
+                # wait for client process to create the socket
+                'while', 'test', '!', '-e', socket_path, run.Raw(';'),
+                'do', 'sleep', '1', run.Raw(';'), 'done',
+                ],
+            )
+
+        for command, config in tests.items():
+            if config is None:
+                config = {}
+            teuthology.deep_merge(config, overrides)
+            log.debug('Testing %s with config %s', command, str(config))
+
+            test_path = None
+            if 'test' in config:
+                # hack: the git_url is always ceph-ci or ceph
+                git_url = teuth_config.get_ceph_git_url()
+                repo_name = 'ceph.git'
+                if git_url.count('ceph-ci'):
+                    repo_name = 'ceph-ci.git'
+                url = config['test'].format(
+                    branch=config.get('branch', 'master'),
+                    repo=repo_name,
+                    )
+                test_path = os.path.join(tmp_dir, command)
+                remote.run(
+                    args=[
+                        'wget',
+                        '-q',
+                        '-O',
+                        test_path,
+                        '--',
+                        url,
+                        run.Raw('&&'),
+                        'chmod',
+                        'u=rx',
+                        '--',
+                        test_path,
+                        ],
+                    )
+
+            args = config.get('args', [])
+            assert isinstance(args, list), \
+                'admin socket command args must be a list'
+            sock_out = _socket_command(ctx, remote, socket_path, command, args)
+            if test_path is not None:
+                remote.run(
+                    args=[
+                        test_path,
+                        ],
+                    stdin=json.dumps(sock_out),
+                    )
+
+    finally:
+        remote.run(
+            args=[
+                'rm', '-rf', '--', tmp_dir,
+                ],
+            )
diff --git a/qa/tasks/autotest.py b/qa/tasks/autotest.py
new file mode 100644
index 000000000..80c3fc9d2
--- /dev/null
+++ b/qa/tasks/autotest.py
@@ -0,0 +1,165 @@
+"""
+Run an autotest test on the ceph cluster.
+"""
+import json
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Run an autotest test on the ceph cluster.
+
+    Only autotest client tests are supported.
+
+    The config is a mapping from role name to list of tests to run on
+    that client.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0, client.1]
+        - autotest:
+            client.0: [dbench]
+            client.1: [bonnie]
+
+    You can also specify a list of tests to run on all clients::
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - autotest:
+            all: [dbench]
+    """
+    assert isinstance(config, dict)
+    config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    log.info('Setting up autotest...')
+    testdir = teuthology.get_testdir(ctx)
+    with parallel() as p:
+        for role in config.keys():
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            p.spawn(_download, testdir, remote)
+
+    log.info('Making a separate scratch dir for every client...')
+    for role in config.keys():
+        assert isinstance(role, str)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
+        scratch = os.path.join(mnt, 'client.{id}'.format(id=id_))
+        remote.run(
+            args=[
+                'sudo',
+                'install',
+                '-d',
+                '-m', '0755',
+                '--owner={user}'.format(user='ubuntu'), #TODO
+                '--',
+                scratch,
+                ],
+            )
+
+    with parallel() as p:
+        for role, tests in config.items():
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            p.spawn(_run_tests, testdir, remote, role, tests)
+
+def _download(testdir, remote):
+    """
+    Download.  Does not explicitly support muliple tasks in a single run.
+    """
+    remote.run(
+        args=[
+            # explicitly does not support multiple autotest tasks
+            # in a single run; the result archival would conflict
+            'mkdir', '{tdir}/archive/autotest'.format(tdir=testdir),
+            run.Raw('&&'),
+            'mkdir', '{tdir}/autotest'.format(tdir=testdir),
+            run.Raw('&&'),
+            'wget',
+            '-nv',
+            '--no-check-certificate',
+            'https://github.com/ceph/autotest/tarball/ceph',
+            '-O-',
+            run.Raw('|'),
+            'tar',
+            '-C', '{tdir}/autotest'.format(tdir=testdir),
+            '-x',
+            '-z',
+            '-f-',
+            '--strip-components=1',
+            ],
+        )
+
+def _run_tests(testdir, remote, role, tests):
+    """
+    Spawned to run test on remote site
+    """
+    assert isinstance(role, str)
+    PREFIX = 'client.'
+    assert role.startswith(PREFIX)
+    id_ = role[len(PREFIX):]
+    mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
+    scratch = os.path.join(mnt, 'client.{id}'.format(id=id_))
+
+    assert isinstance(tests, list)
+    for idx, testname in enumerate(tests):
+        log.info('Running autotest client test #%d: %s...', idx, testname)
+
+        tag = 'client.{id}.num{idx}.{testname}'.format(
+            idx=idx,
+            testname=testname,
+            id=id_,
+            )
+        control = '{tdir}/control.{tag}'.format(tdir=testdir, tag=tag)
+        remote.write_file(
+            path=control,
+            data='import json; data=json.loads({data!r}); job.run_test(**data)'.format(
+                data=json.dumps(dict(
+                        url=testname,
+                        dir=scratch,
+                        # TODO perhaps tag
+                        # results will be in {testdir}/autotest/client/results/dbench
+                        # or {testdir}/autotest/client/results/dbench.{tag}
+                        )),
+                ),
+            )
+        remote.run(
+            args=[
+                '{tdir}/autotest/client/bin/autotest'.format(tdir=testdir),
+                '--verbose',
+                '--harness=simple',
+                '--tag={tag}'.format(tag=tag),
+                control,
+                run.Raw('3>&1'),
+                ],
+            )
+
+        remote.run(
+            args=[
+                'rm', '-rf', '--', control,
+                ],
+            )
+
+        remote.run(
+            args=[
+                'mv',
+                '--',
+                '{tdir}/autotest/client/results/{tag}'.format(tdir=testdir, tag=tag),
+                '{tdir}/archive/autotest/{tag}'.format(tdir=testdir, tag=tag),
+                ],
+            )
+
+    remote.run(
+        args=[
+            'rm', '-rf', '--', '{tdir}/autotest'.format(tdir=testdir),
+            ],
+        )
diff --git a/qa/tasks/aver.py b/qa/tasks/aver.py
new file mode 100644
index 000000000..79ee18c5c
--- /dev/null
+++ b/qa/tasks/aver.py
@@ -0,0 +1,67 @@
+"""
+Aver wrapper task
+"""
+import contextlib
+import logging
+from subprocess import check_call, Popen, PIPE
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Execute an aver assertion
+
+    Parameters:
+
+        input: file containing data referred to by the assertions. File name is
+               relative to the job's archive path
+        validations: list of validations in the Aver language
+
+    Example:
+    - aver:
+        input: bench_output.csv
+        validations:
+        - expect performance(alg='ceph') > performance(alg='raw')
+        - for size > 3 expect avg_throughput > 2000
+    """
+    log.info('Beginning aver...')
+    assert isinstance(config, dict), 'expecting dictionary for configuration'
+
+    if 'input' not in config:
+        raise Exception("Expecting 'input' option")
+    if len(config.get('validations', [])) < 1:
+        raise Exception("Expecting at least one entry in 'validations'")
+
+    url = ('https://github.com/ivotron/aver/releases/download/'
+           'v0.3.0/aver-linux-amd64.tar.bz2')
+
+    aver_path = ctx.archive + '/aver'
+
+    # download binary
+    check_call(['wget', '-O', aver_path + '.tbz', url])
+    check_call(['tar', 'xfj', aver_path + '.tbz', '-C', ctx.archive])
+
+    # print version
+    process = Popen([aver_path, '-v'], stdout=PIPE)
+    log.info(process.communicate()[0])
+
+    # validate
+    for validation in config['validations']:
+        cmd = (aver_path + ' -s -i ' + (ctx.archive + '/' + config['input']) +
+               ' "' + validation + '"')
+        log.info("executing: " + cmd)
+        process = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
+        (stdout, stderr) = process.communicate()
+        if stderr:
+            log.info('aver stderr: ' + stderr)
+        log.info('aver result: ' + stdout)
+        if stdout.strip(' \t\n\r') != 'true':
+            raise Exception('Failed validation: ' + validation)
+
+    try:
+        yield
+    finally:
+        log.info('Removing aver binary...')
+        check_call(['rm', aver_path, aver_path + '.tbz'])
diff --git a/qa/tasks/backfill_toofull.py b/qa/tasks/backfill_toofull.py
new file mode 100644
index 000000000..f4ff90a46
--- /dev/null
+++ b/qa/tasks/backfill_toofull.py
@@ -0,0 +1,193 @@
+"""
+Backfill_toofull
+"""
+import logging
+import time
+from tasks import ceph_manager
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def wait_for_pg_state(manager, pgid, state, to_osd):
+    log.debug("waiting for pg %s state is %s" % (pgid, state))
+    for i in range(300):
+        time.sleep(5)
+        manager.flush_pg_stats([0, 1, 2, 3])
+        pgs = manager.get_pg_stats()
+        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+        log.info('pg=%s' % pg);
+        assert pg
+        status = pg['state'].split('+')
+        if 'active' not in status:
+            log.debug('not active')
+            continue
+        if state not in status:
+            log.debug('not %s' % state)
+            continue
+        assert to_osd in pg['up']
+        return
+    assert False, '%s not in %s' % (pgid, state)
+
+
+def task(ctx, config):
+    """
+    Test backfill reservation calculates "toofull" condition correctly.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'backfill_toofull task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    profile = config.get('erasure_code_profile', {
+        'k': '2',
+        'm': '1',
+        'crush-failure-domain': 'osd'
+    })
+    profile_name = profile.get('name', 'backfill_toofull')
+    manager.create_erasure_code_profile(profile_name, profile)
+    pool = manager.create_pool_with_unique_name(
+        pg_num=1,
+        erasure_code_profile_name=profile_name,
+        min_size=2)
+    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
+                            'pg_autoscale_mode', 'off')
+
+    manager.flush_pg_stats([0, 1, 2, 3])
+    manager.wait_for_clean()
+
+    pool_id = manager.get_pool_num(pool)
+    pgid = '%d.0' % pool_id
+    pgs = manager.get_pg_stats()
+    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug("acting=%s" % acting)
+    assert acting
+    primary = acting[0]
+    target = acting[1]
+
+    log.debug("write some data")
+    rados(ctx, mon, ['-p', pool, 'bench', '120', 'write', '--no-cleanup'])
+    df = manager.get_osd_df(target)
+    log.debug("target osd df: %s" % df)
+
+    total_kb = df['kb']
+    used_kb = df['kb_used']
+
+    log.debug("pause recovery")
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'nobackfill')
+    manager.raw_cluster_cmd('osd', 'set', 'norecover')
+
+    log.debug("stop tartget osd %s" % target)
+    manager.kill_osd(target)
+    manager.wait_till_active()
+
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug('pg=%s' % pg)
+    assert pg
+
+    log.debug("re-write data")
+    rados(ctx, mon, ['-p', pool, 'cleanup'])
+    time.sleep(10)
+    rados(ctx, mon, ['-p', pool, 'bench', '60', 'write', '--no-cleanup'])
+
+    df = manager.get_osd_df(primary)
+    log.debug("primary osd df: %s" % df)
+
+    primary_used_kb = df['kb_used']
+
+    log.info("test backfill reservation rejected with toofull")
+
+    # We set backfillfull ratio less than new data size and expect the pg
+    # entering backfill_toofull state.
+    #
+    # We also need to update nearfull ratio to prevent "full ratio(s) out of order".
+
+    backfillfull = 0.9 * primary_used_kb / total_kb
+    nearfull = backfillfull * 0.9
+
+    log.debug("update nearfull ratio to %s and backfillfull ratio to %s" %
+              (nearfull, backfillfull))
+    manager.raw_cluster_cmd('osd', 'set-nearfull-ratio',
+                            '{:.3f}'.format(nearfull + 0.001))
+    manager.raw_cluster_cmd('osd', 'set-backfillfull-ratio',
+                            '{:.3f}'.format(backfillfull + 0.001))
+
+    log.debug("start tartget osd %s" % target)
+
+    manager.revive_osd(target)
+    manager.wait_for_active()
+    manager.wait_till_osd_is_up(target)
+
+    wait_for_pg_state(manager, pgid, 'backfill_toofull', target)
+
+    log.info("test pg not enter backfill_toofull after restarting backfill")
+
+    # We want to set backfillfull ratio to be big enough for the target to
+    # successfully backfill new data but smaller than the sum of old and new
+    # data, so if the osd backfill reservation incorrectly calculates "toofull"
+    # the test will detect this (fail).
+    #
+    # Note, we need to operate with "uncompressed" bytes because currently
+    # osd backfill reservation does not take compression into account.
+    #
+    # We also need to update nearfull ratio to prevent "full ratio(s) out of order".
+
+    pdf = manager.get_pool_df(pool)
+    log.debug("pool %s df: %s" % (pool, pdf))
+    assert pdf
+    compress_ratio = 1.0 * pdf['compress_under_bytes'] / pdf['compress_bytes_used'] \
+        if pdf['compress_bytes_used'] > 0 else 1.0
+    log.debug("compress_ratio: %s" % compress_ratio)
+
+    backfillfull = (used_kb + primary_used_kb) * compress_ratio / total_kb
+    assert backfillfull < 0.9
+    nearfull_min = max(used_kb, primary_used_kb) * compress_ratio / total_kb
+    assert nearfull_min < backfillfull
+    delta = backfillfull - nearfull_min
+    nearfull = nearfull_min + delta * 0.1
+    backfillfull = nearfull_min + delta * 0.2
+
+    log.debug("update nearfull ratio to %s and backfillfull ratio to %s" %
+              (nearfull, backfillfull))
+    manager.raw_cluster_cmd('osd', 'set-nearfull-ratio',
+                            '{:.3f}'.format(nearfull + 0.001))
+    manager.raw_cluster_cmd('osd', 'set-backfillfull-ratio',
+                            '{:.3f}'.format(backfillfull + 0.001))
+
+    wait_for_pg_state(manager, pgid, 'backfilling', target)
+
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug('pg=%s' % pg)
+    assert pg
+
+    log.debug("interrupt %s backfill" % target)
+    manager.mark_down_osd(target)
+    # after marking the target osd down it will automatically be
+    # up soon again
+
+    log.debug("resume recovery")
+    manager.raw_cluster_cmd('osd', 'unset', 'noout')
+    manager.raw_cluster_cmd('osd', 'unset', 'nobackfill')
+    manager.raw_cluster_cmd('osd', 'unset', 'norecover')
+
+    # wait for everything to peer, backfill and recover
+    manager.wait_for_clean()
+
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.info('pg=%s' % pg)
+    assert pg
+    assert 'clean' in pg['state'].split('+')
diff --git a/qa/tasks/barbican.py b/qa/tasks/barbican.py
new file mode 100644
index 000000000..d43568c61
--- /dev/null
+++ b/qa/tasks/barbican.py
@@ -0,0 +1,524 @@
+"""
+Deploy and configure Barbican for Teuthology
+"""
+import argparse
+import contextlib
+import logging
+import http
+import json
+import time
+import math
+
+from urllib.parse import urlparse
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.exceptions import ConfigError
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download the Barbican from github.
+    Remove downloaded file upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading barbican...')
+    testdir = teuthology.get_testdir(ctx)
+    for (client, cconf) in config.items():
+        branch = cconf.get('force-branch', 'master')
+        log.info("Using branch '%s' for barbican", branch)
+
+        sha1 = cconf.get('sha1')
+        log.info('sha1=%s', sha1)
+
+        ctx.cluster.only(client).run(
+            args=[
+                'bash', '-l'
+                ],
+            )
+        ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone',
+                '-b', branch,
+                'https://github.com/openstack/barbican.git',
+                '{tdir}/barbican'.format(tdir=testdir),
+                ],
+            )
+        if sha1 is not None:
+            ctx.cluster.only(client).run(
+                args=[
+                    'cd', '{tdir}/barbican'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'git', 'reset', '--hard', sha1,
+                    ],
+                )
+    try:
+        yield
+    finally:
+        log.info('Removing barbican...')
+        testdir = teuthology.get_testdir(ctx)
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[
+                    'rm',
+                    '-rf',
+                    '{tdir}/barbican'.format(tdir=testdir),
+                    ],
+                )
+
+def get_barbican_dir(ctx):
+    return '{tdir}/barbican'.format(tdir=teuthology.get_testdir(ctx))
+
+def run_in_barbican_dir(ctx, client, args):
+    ctx.cluster.only(client).run(
+        args=['cd', get_barbican_dir(ctx), run.Raw('&&'), ] + args,
+    )
+
+def run_in_barbican_venv(ctx, client, args):
+    run_in_barbican_dir(ctx, client,
+                        ['.',
+                         '.barbicanenv/bin/activate',
+                         run.Raw('&&')
+                        ] + args)
+
+@contextlib.contextmanager
+def setup_venv(ctx, config):
+    """
+    Setup the virtualenv for Barbican using pip.
+    """
+    assert isinstance(config, dict)
+    log.info('Setting up virtualenv for barbican...')
+    for (client, _) in config.items():
+        run_in_barbican_dir(ctx, client,
+                            ['python3', '-m', 'venv', '.barbicanenv'])
+        run_in_barbican_venv(ctx, client,
+                             ['pip', 'install', '--upgrade', 'pip'])
+        run_in_barbican_venv(ctx, client,
+                             ['pip', 'install', 'pytz',
+                              '-e', get_barbican_dir(ctx)])
+    yield
+
+def assign_ports(ctx, config, initial_port):
+    """
+    Assign port numbers starting from @initial_port
+    """
+    port = initial_port
+    role_endpoints = {}
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for role in roles_for_host:
+            if role in config:
+                role_endpoints[role] = (remote.name.split('@')[1], port)
+                port += 1
+
+    return role_endpoints
+
+def set_authtoken_params(ctx, cclient, cconfig):
+    section_config_list = cconfig['keystone_authtoken'].items()
+    for config in section_config_list:
+        (name, val) = config
+        run_in_barbican_dir(ctx, cclient,
+                            ['sed', '-i',
+                             '/[[]filter:authtoken]/{p;s##'+'{} = {}'.format(name, val)+'#;}',
+                             'etc/barbican/barbican-api-paste.ini'])
+
+    keystone_role = cconfig.get('use-keystone-role', None)
+    public_host, public_port = ctx.keystone.public_endpoints[keystone_role]
+    url = 'http://{host}:{port}/v3'.format(host=public_host,
+                                           port=public_port)
+    run_in_barbican_dir(ctx, cclient,
+                        ['sed', '-i',
+                         '/[[]filter:authtoken]/{p;s##'+'auth_uri = {}'.format(url)+'#;}',
+                         'etc/barbican/barbican-api-paste.ini'])
+    admin_host, admin_port = ctx.keystone.admin_endpoints[keystone_role]
+    admin_url = 'http://{host}:{port}/v3'.format(host=admin_host,
+                                                 port=admin_port)
+    run_in_barbican_dir(ctx, cclient,
+                        ['sed', '-i',
+                         '/[[]filter:authtoken]/{p;s##'+'auth_url = {}'.format(admin_url)+'#;}',
+                         'etc/barbican/barbican-api-paste.ini'])
+
+def fix_barbican_api_paste(ctx, cclient):
+    run_in_barbican_dir(ctx, cclient,
+                        ['sed', '-i', '-n',
+                         '/\\[pipeline:barbican_api]/ {p;n; /^pipeline =/ '+
+                         '{ s/.*/pipeline = unauthenticated-context apiapp/;p;d } } ; p',
+                         './etc/barbican/barbican-api-paste.ini'])
+
+def fix_barbican_api(ctx, cclient):
+    run_in_barbican_dir(ctx, cclient,
+                        ['sed', '-i',
+                         '/prop_dir =/ s#etc/barbican#{}/etc/barbican#'.format(get_barbican_dir(ctx)),
+                         'bin/barbican-api'])
+
+def copy_policy_json(ctx, cclient):
+    run_in_barbican_dir(ctx, cclient,
+                        ['cp',
+                         get_barbican_dir(ctx)+'/etc/barbican/policy.json',
+                         get_barbican_dir(ctx)])
+
+def create_barbican_conf(ctx, cclient):
+    barbican_host, barbican_port = ctx.barbican.endpoints[cclient]
+    barbican_url = 'http://{host}:{port}'.format(host=barbican_host,
+                                                 port=barbican_port)
+    log.info("barbican url=%s", barbican_url)
+
+    run_in_barbican_dir(ctx, cclient,
+                        ['bash', '-c',
+                         'echo -n -e "[DEFAULT]\nhost_href=' + barbican_url + '\n" ' + \
+                         '>barbican.conf'])
+
+@contextlib.contextmanager
+def configure_barbican(ctx, config):
+    """
+    Configure barbican paste-api and barbican-api.
+    """
+    assert isinstance(config, dict)
+    (cclient, cconfig) = next(iter(config.items()))
+
+    keystone_role = cconfig.get('use-keystone-role', None)
+    if keystone_role is None:
+        raise ConfigError('use-keystone-role not defined in barbican task')
+
+    set_authtoken_params(ctx, cclient, cconfig)
+    fix_barbican_api(ctx, cclient)
+    fix_barbican_api_paste(ctx, cclient)
+    copy_policy_json(ctx, cclient)
+    create_barbican_conf(ctx, cclient)
+    try:
+        yield
+    finally:
+        pass
+
+@contextlib.contextmanager
+def run_barbican(ctx, config):
+    assert isinstance(config, dict)
+    log.info('Running barbican...')
+
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        cluster_name, _, client_id = teuthology.split_role(client)
+
+        # start the public endpoint
+        client_public_with_id = 'barbican.public' + '.' + client_id
+
+        run_cmd = ['cd', get_barbican_dir(ctx), run.Raw('&&'),
+                   '.', '.barbicanenv/bin/activate', run.Raw('&&'),
+                   'HOME={}'.format(get_barbican_dir(ctx)), run.Raw('&&'),
+                   'bin/barbican-api',
+                   run.Raw('& { read; kill %1; }')]
+                   #run.Raw('1>/dev/null')
+
+        run_cmd = 'cd ' + get_barbican_dir(ctx) + ' && ' + \
+                  '. .barbicanenv/bin/activate && ' + \
+                  'HOME={}'.format(get_barbican_dir(ctx)) + ' && ' + \
+                  'exec bin/barbican-api & { read; kill %1; }'
+
+        ctx.daemons.add_daemon(
+            remote, 'barbican', client_public_with_id,
+            cluster=cluster_name,
+            args=['bash', '-c', run_cmd],
+            logger=log.getChild(client),
+            stdin=run.PIPE,
+            cwd=get_barbican_dir(ctx),
+            wait=False,
+            check_status=False,
+        )
+
+        # sleep driven synchronization
+        run_in_barbican_venv(ctx, client, ['sleep', '15'])
+    try:
+        yield
+    finally:
+        log.info('Stopping Barbican instance')
+        ctx.daemons.get_daemon('barbican', client_public_with_id,
+                               cluster_name).stop()
+
+
+@contextlib.contextmanager
+def create_secrets(ctx, config):
+    """
+    Create a main and an alternate s3 user.
+    """
+    assert isinstance(config, dict)
+    (cclient, cconfig) = next(iter(config.items()))
+
+    rgw_user = cconfig['rgw_user']
+
+    keystone_role = cconfig.get('use-keystone-role', None)
+    keystone_host, keystone_port = ctx.keystone.public_endpoints[keystone_role]
+    barbican_host, barbican_port = ctx.barbican.endpoints[cclient]
+    barbican_url = 'http://{host}:{port}'.format(host=barbican_host,
+                                                 port=barbican_port)
+    log.info("barbican_url=%s", barbican_url)
+    #fetching user_id of user that gets secrets for radosgw
+    token_req = http.client.HTTPConnection(keystone_host, keystone_port, timeout=30)
+    token_req.request(
+        'POST',
+        '/v3/auth/tokens',
+        headers={'Content-Type':'application/json'},
+        body=json.dumps({
+            "auth": {
+                "identity": {
+                    "methods": ["password"],
+                    "password": {
+                        "user": {
+                            "domain": {"id": "default"},
+                            "name": rgw_user["username"],
+                            "password": rgw_user["password"]
+                        }
+                    }
+                },
+                "scope": {
+                    "project": {
+                        "domain": {"id": "default"},
+                        "name": rgw_user["tenantName"]
+                    }
+                }
+            }
+        }))
+    rgw_access_user_resp = token_req.getresponse()
+    if not (rgw_access_user_resp.status >= 200 and
+            rgw_access_user_resp.status < 300):
+        raise Exception("Cannot authenticate user "+rgw_user["username"]+" for secret creation")
+    #    baru_resp = json.loads(baru_req.data)
+    rgw_access_user_data = json.loads(rgw_access_user_resp.read().decode())
+    rgw_user_id = rgw_access_user_data['token']['user']['id']
+    if 'secrets' in cconfig:
+        for secret in cconfig['secrets']:
+            if 'name' not in secret:
+                raise ConfigError('barbican.secrets must have "name" field')
+            if 'base64' not in secret:
+                raise ConfigError('barbican.secrets must have "base64" field')
+            if 'tenantName' not in secret:
+                raise ConfigError('barbican.secrets must have "tenantName" field')
+            if 'username' not in secret:
+                raise ConfigError('barbican.secrets must have "username" field')
+            if 'password' not in secret:
+                raise ConfigError('barbican.secrets must have "password" field')
+
+            token_req = http.client.HTTPConnection(keystone_host, keystone_port, timeout=30)
+            token_req.request(
+                'POST',
+                '/v3/auth/tokens',
+                headers={'Content-Type':'application/json'},
+                body=json.dumps({
+                    "auth": {
+                        "identity": {
+                            "methods": ["password"],
+                            "password": {
+                                "user": {
+                                    "domain": {"id": "default"},
+                                    "name": secret["username"],
+                                    "password": secret["password"]
+                                }
+                            }
+                        },
+                        "scope": {
+                            "project": {
+                                "domain": {"id": "default"},
+                                "name": secret["tenantName"]
+                            }
+                        }
+                    }
+                }))
+            token_resp = token_req.getresponse()
+            if not (token_resp.status >= 200 and
+                    token_resp.status < 300):
+                raise Exception("Cannot authenticate user "+secret["username"]+" for secret creation")
+
+            expire = time.time() + 5400		# now + 90m
+            (expire_fract,dummy) = math.modf(expire)
+            expire_format = "%%FT%%T.%06d" % (round(expire_fract*1000000))
+            expiration = time.strftime(expire_format, time.gmtime(expire))
+            token_id = token_resp.getheader('x-subject-token')
+
+            key1_json = json.dumps(
+                {
+                    "name": secret['name'],
+                    "expiration": expiration,
+                    "algorithm": "aes",
+                    "bit_length": 256,
+                    "mode": "cbc",
+                    "payload": secret['base64'],
+                    "payload_content_type": "application/octet-stream",
+                    "payload_content_encoding": "base64"
+                })
+
+            sec_req = http.client.HTTPConnection(barbican_host, barbican_port, timeout=30)
+            try:
+                sec_req.request(
+                    'POST',
+                    '/v1/secrets',
+                    headers={'Content-Type': 'application/json',
+                             'Accept': '*/*',
+                             'X-Auth-Token': token_id},
+                    body=key1_json
+                )
+            except:
+                log.info("catched exception!")
+                run_in_barbican_venv(ctx, cclient, ['sleep', '900'])
+
+            barbican_sec_resp = sec_req.getresponse()
+            if not (barbican_sec_resp.status >= 200 and
+                    barbican_sec_resp.status < 300):
+                raise Exception("Cannot create secret")
+            barbican_data = json.loads(barbican_sec_resp.read().decode())
+            if 'secret_ref' not in barbican_data:
+                raise ValueError("Malformed secret creation response")
+            secret_ref = barbican_data["secret_ref"]
+            log.info("secret_ref=%s", secret_ref)
+            secret_url_parsed = urlparse(secret_ref)
+            acl_json = json.dumps(
+                {
+                    "read": {
+                        "users": [rgw_user_id],
+                        "project-access": True
+                    }
+                })
+            acl_req = http.client.HTTPConnection(secret_url_parsed.netloc, timeout=30)
+            acl_req.request(
+                'PUT',
+                secret_url_parsed.path+'/acl',
+                headers={'Content-Type': 'application/json',
+                         'Accept': '*/*',
+                         'X-Auth-Token': token_id},
+                body=acl_json
+            )
+            barbican_acl_resp = acl_req.getresponse()
+            if not (barbican_acl_resp.status >= 200 and
+                    barbican_acl_resp.status < 300):
+                raise Exception("Cannot set ACL for secret")
+
+            key = {'id': secret_ref.split('secrets/')[1], 'payload': secret['base64']}
+            ctx.barbican.keys[secret['name']] = key
+
+    run_in_barbican_venv(ctx, cclient, ['sleep', '3'])
+    try:
+        yield
+    finally:
+        pass
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy and configure Keystone
+
+    Example of configuration:
+
+    tasks:
+      - local_cluster:
+          cluster_path: /home/adam/ceph-1/build
+      - local_rgw:
+      - tox: [ client.0 ]
+      - keystone:
+          client.0:
+            sha1: 17.0.0.0rc2
+            force-branch: master
+            projects:
+              - name: rgwcrypt
+                description: Encryption Tenant
+              - name: barbican
+                description: Barbican
+              - name: s3
+                description: S3 project
+            users:
+              - name: rgwcrypt-user
+                password: rgwcrypt-pass
+                project: rgwcrypt
+              - name: barbican-user
+                password: barbican-pass
+                project: barbican
+              - name: s3-user
+                password: s3-pass
+                project: s3
+            roles: [ name: Member, name: creator ]
+            role-mappings:
+              - name: Member
+                user: rgwcrypt-user
+                project: rgwcrypt
+              - name: admin
+                user: barbican-user
+                project: barbican
+              - name: creator
+                user: s3-user
+                project: s3
+            services:
+              - name: keystone
+                type: identity
+                description: Keystone Identity Service
+      - barbican:
+          client.0:
+            force-branch: master
+            use-keystone-role: client.0
+            keystone_authtoken:
+              auth_plugin: password
+              username: barbican-user
+              password: barbican-pass
+              user_domain_name: Default
+            rgw_user:
+              tenantName: rgwcrypt
+              username: rgwcrypt-user
+              password: rgwcrypt-pass
+            secrets:
+              - name: my-key-1
+                base64: a2V5MS5GcWVxKzhzTGNLaGtzQkg5NGVpb1FKcFpGb2c=
+                tenantName: s3
+                username: s3-user
+                password: s3-pass
+              - name: my-key-2
+                base64: a2V5Mi5yNUNNMGFzMVdIUVZxcCt5NGVmVGlQQ1k4YWg=
+                tenantName: s3
+                username: s3-user
+                password: s3-pass
+      - s3tests:
+          client.0:
+            force-branch: master
+            kms_key: my-key-1
+      - rgw:
+          client.0:
+            use-keystone-role: client.0
+            use-barbican-role: client.0
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task keystone only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level.
+    for client in config.keys():
+        if not config[client]:
+            config[client] = {}
+        teuthology.deep_merge(config[client], overrides.get('barbican', {}))
+
+    log.debug('Barbican config is %s', config)
+
+    if not hasattr(ctx, 'keystone'):
+        raise ConfigError('barbican must run after the keystone task')
+
+
+    ctx.barbican = argparse.Namespace()
+    ctx.barbican.endpoints = assign_ports(ctx, config, 9311)
+    ctx.barbican.keys = {}
+    
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: setup_venv(ctx=ctx, config=config),
+        lambda: configure_barbican(ctx=ctx, config=config),
+        lambda: run_barbican(ctx=ctx, config=config),
+        lambda: create_secrets(ctx=ctx, config=config),
+        ):
+        yield
diff --git a/qa/tasks/blktrace.py b/qa/tasks/blktrace.py
new file mode 100644
index 000000000..10b1da0c0
--- /dev/null
+++ b/qa/tasks/blktrace.py
@@ -0,0 +1,96 @@
+"""
+Run blktrace program through teuthology
+"""
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+blktrace = '/usr/sbin/blktrace'
+daemon_signal = 'term'
+
+@contextlib.contextmanager
+def setup(ctx, config):
+    """
+    Setup all the remotes
+    """
+    osds = ctx.cluster.only(teuthology.is_type('osd', config['cluster']))
+    log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=teuthology.get_testdir(ctx))
+
+    for remote, roles_for_host in osds.remotes.items():
+        log.info('Creating %s on %s' % (log_dir, remote.name))
+        remote.run(
+            args=['mkdir', '-p', '-m0755', '--', log_dir],
+            wait=False,
+            )
+    yield
+
+@contextlib.contextmanager
+def execute(ctx, config):
+    """
+    Run the blktrace program on remote machines.
+    """
+    procs = []
+    testdir = teuthology.get_testdir(ctx)
+    log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=testdir)
+
+    osds = ctx.cluster.only(teuthology.is_type('osd'))
+    for remote, roles_for_host in osds.remotes.items():
+        roles_to_devs = ctx.disk_config.remote_to_roles_to_dev[remote]
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
+                                                     config['cluster']):
+            if roles_to_devs.get(role):
+                dev = roles_to_devs[role]
+                log.info("running blktrace on %s: %s" % (remote.name, dev))
+
+                proc = remote.run(
+                    args=[
+                        'cd',
+                        log_dir,
+                        run.Raw(';'),
+                        'daemon-helper',
+                        daemon_signal,
+                        'sudo',
+                        blktrace,
+                        '-o',
+                        dev.rsplit("/", 1)[1],
+                        '-d',
+                        dev,
+                        ],
+                    wait=False,
+                    stdin=run.PIPE,
+                    )
+                procs.append(proc)
+    try:
+        yield
+    finally:
+        osds = ctx.cluster.only(teuthology.is_type('osd'))
+        log.info('stopping blktrace processs')
+        for proc in procs:
+            proc.stdin.close()
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Usage:
+        blktrace:
+
+    or:
+        blktrace:
+          cluster: backup
+
+    Runs blktrace on all osds in the specified cluster (the 'ceph' cluster by
+    default).
+    """
+    if config is None:
+        config = {}
+    config['cluster'] = config.get('cluster', 'ceph')
+
+    with contextutil.nested(
+        lambda: setup(ctx=ctx, config=config),
+        lambda: execute(ctx=ctx, config=config),
+        ):
+        yield
diff --git a/qa/tasks/boto.cfg.template b/qa/tasks/boto.cfg.template
new file mode 100644
index 000000000..cdfe8873b
--- /dev/null
+++ b/qa/tasks/boto.cfg.template
@@ -0,0 +1,2 @@
+[Boto]
+http_socket_timeout = {idle_timeout}
diff --git a/qa/tasks/cbt.py b/qa/tasks/cbt.py
new file mode 100644
index 000000000..56c57138b
--- /dev/null
+++ b/qa/tasks/cbt.py
@@ -0,0 +1,293 @@
+import logging
+import os
+import yaml
+
+from teuthology import misc
+from teuthology.orchestra import run
+from teuthology.task import Task
+
+log = logging.getLogger(__name__)
+
+
+class CBT(Task):
+    """
+    Passes through a CBT configuration yaml fragment.
+    """
+    def __init__(self, ctx, config):
+        super(CBT, self).__init__(ctx, config)
+        self.log = log
+
+    def hosts_of_type(self, type_):
+        return [r.name for r in self.ctx.cluster.only(misc.is_type(type_)).remotes.keys()]
+
+    def generate_cbt_config(self):
+        mon_hosts = self.hosts_of_type('mon')
+        osd_hosts = self.hosts_of_type('osd')
+        client_hosts = self.hosts_of_type('client')
+        rgw_client = {}
+        rgw_client[client_hosts[0]] = None
+        rgw_hosts = self.config.get('cluster', {}).get('rgws', rgw_client)
+        cluster_config = dict(
+            user=self.config.get('cluster', {}).get('user', 'ubuntu'),
+            head=mon_hosts[0],
+            osds=osd_hosts,
+            mons=mon_hosts,
+            clients=client_hosts,
+            rgws=rgw_hosts,
+            osds_per_node=self.config.get('cluster', {}).get('osds_per_node', 1),
+            rebuild_every_test=False,
+            use_existing=True,
+            is_teuthology=self.config.get('cluster', {}).get('is_teuthology', True),
+            iterations=self.config.get('cluster', {}).get('iterations', 1),
+            tmp_dir='/tmp/cbt',
+            pool_profiles=self.config.get('cluster', {}).get('pool_profiles'),
+            )
+
+        benchmark_config = self.config.get('benchmarks')
+        benchmark_type = next(iter(benchmark_config.keys()))
+        if benchmark_type in ['librbdfio', 'fio']:
+          testdir = misc.get_testdir(self.ctx)
+          benchmark_config[benchmark_type]['cmd_path'] = os.path.join(testdir, 'fio/fio')
+        if benchmark_type == 'cosbench':
+            # create cosbench_dir and cosbench_xml_dir
+            testdir = misc.get_testdir(self.ctx)
+            benchmark_config['cosbench']['cosbench_dir'] = os.path.join(testdir, 'cos')
+            benchmark_config['cosbench']['cosbench_xml_dir'] = os.path.join(testdir, 'xml')
+            self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', benchmark_config['cosbench']['cosbench_xml_dir']])
+            benchmark_config['cosbench']['controller'] = osd_hosts[0]
+
+            # set auth details
+            remotes_and_roles = self.ctx.cluster.remotes.items()
+            ips = [host for (host, port) in
+                   (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
+            benchmark_config['cosbench']['auth'] = "username=cosbench:operator;password=intel2012;url=http://%s:80/auth/v1.0;retry=9" %(ips[0])
+        client_endpoints_config = self.config.get('client_endpoints', None)
+
+        return dict(
+            cluster=cluster_config,
+            benchmarks=benchmark_config,
+            client_endpoints = client_endpoints_config,
+            )
+
+    def install_dependencies(self):
+        system_type = misc.get_system_type(self.first_mon)
+
+        if system_type == 'rpm':
+            install_cmd = ['sudo', 'yum', '-y', 'install']
+            cbt_depends = ['python3-yaml', 'python3-lxml', 'librbd-devel', 'pdsh', 'collectl']
+        else:
+            install_cmd = ['sudo', 'apt-get', '-y', '--force-yes', 'install']
+            cbt_depends = ['python3-yaml', 'python3-lxml', 'librbd-dev', 'collectl']
+        self.first_mon.run(args=install_cmd + cbt_depends)
+
+        benchmark_type = next(iter(self.cbt_config.get('benchmarks').keys()))
+        self.log.info('benchmark: %s', benchmark_type)
+
+        if benchmark_type in ['librbdfio', 'fio']:
+            # install fio
+            testdir = misc.get_testdir(self.ctx)
+            self.first_mon.run(
+                args=[
+                    'git', 'clone', '-b', 'master',
+                    'https://github.com/axboe/fio.git',
+                    '{tdir}/fio'.format(tdir=testdir)
+                ]
+            )
+            self.first_mon.run(
+                args=[
+                    'cd', os.path.join(testdir, 'fio'), run.Raw('&&'),
+                    './configure', run.Raw('&&'),
+                    'make'
+                ]
+            )
+
+        if benchmark_type == 'cosbench':
+            # install cosbench
+            self.log.info('install dependencies for cosbench')
+            if system_type == 'rpm':
+                cosbench_depends = ['wget', 'unzip', 'java-1.7.0-openjdk', 'curl']
+            else:
+                cosbench_depends = ['wget', 'unzip', 'openjdk-8-jre', 'curl']
+            self.first_mon.run(args=install_cmd + cosbench_depends)
+            testdir = misc.get_testdir(self.ctx)
+            cosbench_version = '0.4.2.c3'
+            cosbench_location = 'https://github.com/intel-cloud/cosbench/releases/download/v0.4.2.c3/0.4.2.c3.zip'
+            os_version = misc.get_system_type(self.first_mon, False, True)
+
+            # additional requirements for bionic
+            if os_version == '18.04':
+                self.first_mon.run(
+                    args=['sudo', 'apt-get', '-y', 'purge', 'openjdk-11*'])
+                # use our own version of cosbench
+                cosbench_version = 'cosbench-0.4.2.c3.1'
+                # contains additional parameter "-N" to nc
+                cosbench_location = 'http://drop.ceph.com/qa/cosbench-0.4.2.c3.1.zip'
+                cosbench_dir = os.path.join(testdir, cosbench_version)
+                self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', cosbench_dir])
+                self.first_mon.run(
+                    args=[
+                        'cd', testdir, run.Raw('&&'),
+                        'wget',
+                        cosbench_location, run.Raw('&&'),
+                        'unzip', '{name}.zip'.format(name=cosbench_version), '-d', cosbench_version
+                    ]
+                )
+            else:
+                self.first_mon.run(
+                    args=[
+                        'cd', testdir, run.Raw('&&'),
+                        'wget',
+                        cosbench_location, run.Raw('&&'),
+                        'unzip', '{name}.zip'.format(name=cosbench_version)
+                    ]
+                )
+            self.first_mon.run(
+                args=[
+                    'cd', testdir, run.Raw('&&'),
+                    'ln', '-s', cosbench_version, 'cos',
+                ]
+            )
+            self.first_mon.run(
+                args=[
+                    'cd', os.path.join(testdir, 'cos'), run.Raw('&&'),
+                    'chmod', '+x', run.Raw('*.sh'),
+                ]
+            )
+
+            # start cosbench and check info
+            self.log.info('start cosbench')
+            self.first_mon.run(
+                args=[
+                    'cd', testdir, run.Raw('&&'),
+                    'cd', 'cos', run.Raw('&&'),
+                    'sh', 'start-all.sh'
+                ]
+            )
+            self.log.info('check cosbench info')
+            self.first_mon.run(
+                args=[
+                    'cd', testdir, run.Raw('&&'),
+                    'cd', 'cos', run.Raw('&&'),
+                    'sh', 'cli.sh', 'info'
+                ]
+            )
+
+    def checkout_cbt(self):
+        testdir = misc.get_testdir(self.ctx)
+        repo = self.config.get('repo', 'https://github.com/ceph/cbt.git')
+        branch = self.config.get('branch', 'master')
+        branch = self.config.get('force-branch', branch)
+        sha1 = self.config.get('sha1')
+        if sha1 is None:
+            self.first_mon.run(
+                args=[
+                    'git', 'clone', '--depth', '1', '-b', branch, repo,
+                    '{tdir}/cbt'.format(tdir=testdir)
+                ]
+            )
+        else:
+            self.first_mon.run(
+                args=[
+                    'git', 'clone', '-b', branch, repo,
+                    '{tdir}/cbt'.format(tdir=testdir)
+                ]
+            )
+            self.first_mon.run(
+                args=[
+                    'cd', os.path.join(testdir, 'cbt'), run.Raw('&&'),
+                    'git', 'reset', '--hard', sha1,
+                ]
+            )
+
+    def setup(self):
+        super(CBT, self).setup()
+        self.first_mon = next(iter(self.ctx.cluster.only(misc.get_first_mon(self.ctx, self.config)).remotes.keys()))
+        self.cbt_config = self.generate_cbt_config()
+        self.log.info('cbt configuration is %s', self.cbt_config)
+        self.cbt_dir = os.path.join(misc.get_archive_dir(self.ctx), 'cbt')
+        self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', self.cbt_dir])
+        self.first_mon.write_file(
+                os.path.join(self.cbt_dir, 'cbt_config.yaml'),
+                yaml.safe_dump(self.cbt_config, default_flow_style=False))
+        self.checkout_cbt()
+        self.install_dependencies()
+
+    def begin(self):
+        super(CBT, self).begin()
+        testdir = misc.get_testdir(self.ctx)
+        self.first_mon.run(
+            args=[
+                '{tdir}/cbt/cbt.py'.format(tdir=testdir),
+                '-a', self.cbt_dir,
+                '{cbtdir}/cbt_config.yaml'.format(cbtdir=self.cbt_dir),
+            ],
+        )
+        preserve_file = os.path.join(self.ctx.archive, '.preserve')
+        open(preserve_file, 'a').close()
+
+    def end(self):
+        super(CBT, self).end()
+        testdir = misc.get_testdir(self.ctx)
+        self.first_mon.run(
+            args=[
+                'rm', '--one-file-system', '-rf', '--',
+                '{tdir}/cbt'.format(tdir=testdir),
+            ]
+        )
+        benchmark_type = next(iter(self.cbt_config.get('benchmarks').keys()))
+        if benchmark_type in ['librbdfio', 'fio']:
+            self.first_mon.run(
+                args=[
+                    'rm', '--one-file-system', '-rf', '--',
+                    '{tdir}/fio'.format(tdir=testdir),
+                ]
+            )
+
+        if benchmark_type == 'cosbench':
+            os_version = misc.get_system_type(self.first_mon, False, True)
+            if os_version == '18.04':
+                cosbench_version = 'cosbench-0.4.2.c3.1'
+            else:
+                cosbench_version = '0.4.2.c3'
+            # note: stop-all requires 'nc'
+            self.first_mon.run(
+                args=[
+                    'cd', testdir, run.Raw('&&'),
+                    'cd', 'cos', run.Raw('&&'),
+                    'sh', 'stop-all.sh',
+                    run.Raw('||'), 'true'
+                ]
+            )
+            self.first_mon.run(
+                args=[
+                    'sudo', 'killall', '-9', 'java',
+                    run.Raw('||'), 'true'
+                ]
+            )
+            self.first_mon.run(
+                args=[
+                    'rm', '--one-file-system', '-rf', '--',
+                    '{tdir}/cos'.format(tdir=testdir),
+                ]
+            )
+            self.first_mon.run(
+                args=[
+                    'rm', '--one-file-system', '-rf', '--',
+                    '{tdir}/{version}'.format(tdir=testdir, version=cosbench_version),
+                ]
+            )
+            self.first_mon.run(
+                args=[
+                    'rm', '--one-file-system', '-rf', '--',
+                    '{tdir}/{version}.zip'.format(tdir=testdir, version=cosbench_version),
+                ]
+            )
+            self.first_mon.run(
+                args=[
+                    'rm', '--one-file-system', '-rf', '--',
+                    '{tdir}/xml'.format(tdir=testdir),
+                ]
+            )
+
+
+task = CBT
diff --git a/qa/tasks/ceph.conf.template b/qa/tasks/ceph.conf.template
new file mode 100644
index 000000000..609a124b8
--- /dev/null
+++ b/qa/tasks/ceph.conf.template
@@ -0,0 +1,107 @@
+[global]
+	chdir = ""
+	pid file = /var/run/ceph/$cluster-$name.pid
+        auth supported = cephx
+
+	filestore xattr use omap = true
+
+	mon clock drift allowed = 1.000
+
+	osd crush chooseleaf type = 0
+        auth debug = true
+
+	ms die on old message = true
+	ms die on bug = true
+
+	mon max pg per osd = 10000        # >= luminous
+	mon pg warn max object skew = 0
+
+	# disable pg_autoscaler by default for new pools
+        osd_pool_default_pg_autoscale_mode = off
+
+	osd pool default size = 2
+
+	mon osd allow primary affinity = true
+	mon osd allow pg remap = true
+	mon warn on legacy crush tunables = false
+	mon warn on crush straw calc version zero = false
+	mon warn on no sortbitwise = false
+	mon warn on osd down out interval zero = false
+	mon warn on too few osds = false
+	mon_warn_on_pool_pg_num_not_power_of_two = false
+        mon_warn_on_pool_no_redundancy = false
+	mon_allow_pool_size_one = true
+
+        osd pool default erasure code profile = "plugin=jerasure technique=reed_sol_van k=2 m=1 ruleset-failure-domain=osd crush-failure-domain=osd"
+
+	osd default data pool replay window = 5
+
+	mon allow pool delete = true
+
+	mon cluster log file level = debug
+	debug asserts on shutdown = true
+	mon health detail to clog = false
+
+[osd]
+        osd journal size = 100
+
+        osd scrub load threshold = 5.0
+	osd scrub max interval = 600
+
+	osd recover clone overlap = true
+	osd recovery max chunk = 1048576
+
+	osd debug shutdown = true
+        osd debug op order = true
+        osd debug verify stray on activate = true
+
+	osd open classes on start = true
+        osd debug pg log writeout = true
+
+	osd deep scrub update digest min age = 30
+
+	osd map max advance = 10
+
+        journal zero on create = true
+
+	filestore ondisk finisher threads = 3
+	filestore apply finisher threads = 3
+
+	bdev debug aio = true
+	osd debug misdirected ops = true
+
+[mgr]
+	debug ms = 1
+	debug mgr = 20
+	debug mon = 20
+	debug auth = 20
+	mon reweight min pgs per osd = 4
+	mon reweight min bytes per osd = 10
+	mgr/telemetry/nag = false
+
+[mon]
+	debug ms = 1
+	debug mon = 20
+	debug paxos = 20
+	debug auth = 20
+	mon data avail warn = 5
+	mon mgr mkfs grace = 240
+	mon reweight min pgs per osd = 4
+	mon osd reporter subtree level = osd
+	mon osd prime pg temp = true
+	mon reweight min bytes per osd = 10
+
+	# rotate auth tickets quickly to exercise renewal paths
+	auth mon ticket ttl = 660      # 11m
+	auth service ticket ttl = 240  # 4m
+
+	# don't complain about insecure global_id in the test suite
+	mon_warn_on_insecure_global_id_reclaim = false
+	mon_warn_on_insecure_global_id_reclaim_allowed = false
+
+[client]
+	rgw cache enabled = true
+	rgw enable ops log = true
+	rgw enable usage log = true
+	log file = /var/log/ceph/$cluster-$name.$pid.log
+	admin socket = /var/run/ceph/$cluster-$name.$pid.asok
diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
new file mode 100644
index 000000000..a6eab9be8
--- /dev/null
+++ b/qa/tasks/ceph.py
@@ -0,0 +1,1924 @@
+"""
+Ceph cluster task.
+
+Handle the setup, starting, and clean-up of a Ceph cluster.
+"""
+from copy import deepcopy
+from io import BytesIO
+from io import StringIO
+
+import argparse
+import configobj
+import contextlib
+import errno
+import logging
+import os
+import json
+import time
+import gevent
+import re
+import socket
+import yaml
+
+from paramiko import SSHException
+from tasks.ceph_manager import CephManager, write_conf, get_valgrind_args
+from tarfile import ReadError
+from tasks.cephfs.filesystem import MDSCluster, Filesystem
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology import exceptions
+from teuthology.orchestra import run
+from tasks import ceph_client as cclient
+from teuthology.orchestra.daemon import DaemonGroup
+from tasks.daemonwatchdog import DaemonWatchdog
+
+CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
+DATA_PATH = '/var/lib/ceph/{type_}/{cluster}-{id_}'
+
+log = logging.getLogger(__name__)
+
+
+def generate_caps(type_):
+    """
+    Each call will return the next capability for each system type
+    (essentially a subset of possible role values).  Valid types are osd,
+    mds and client.
+    """
+    defaults = dict(
+        osd=dict(
+            mon='allow profile osd',
+            mgr='allow profile osd',
+            osd='allow *',
+        ),
+        mgr=dict(
+            mon='allow profile mgr',
+            osd='allow *',
+            mds='allow *',
+        ),
+        mds=dict(
+            mon='allow *',
+            mgr='allow *',
+            osd='allow *',
+            mds='allow',
+        ),
+        client=dict(
+            mon='allow rw',
+            mgr='allow r',
+            osd='allow rwx',
+            mds='allow',
+        ),
+    )
+    for subsystem, capability in defaults[type_].items():
+        yield '--cap'
+        yield subsystem
+        yield capability
+
+
+def update_archive_setting(ctx, key, value):
+    """
+    Add logs directory to job's info log file
+    """
+    if ctx.archive is None:
+        return
+    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
+        info_yaml = yaml.safe_load(info_file)
+        info_file.seek(0)
+        if 'archive' in info_yaml:
+            info_yaml['archive'][key] = value
+        else:
+            info_yaml['archive'] = {key: value}
+        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
+
+
+@contextlib.contextmanager
+def ceph_crash(ctx, config):
+    """
+    Gather crash dumps from /var/lib/ceph/crash
+    """
+
+    # Add crash directory to job's archive
+    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')
+
+    try:
+        yield
+
+    finally:
+        if ctx.archive is not None:
+            log.info('Archiving crash dumps...')
+            path = os.path.join(ctx.archive, 'remote')
+            try:
+                os.makedirs(path)
+            except OSError:
+                pass
+            for remote in ctx.cluster.remotes.keys():
+                sub = os.path.join(path, remote.shortname)
+                try:
+                    os.makedirs(sub)
+                except OSError:
+                    pass
+                try:
+                    teuthology.pull_directory(remote, '/var/lib/ceph/crash',
+                                              os.path.join(sub, 'crash'))
+                except ReadError:
+                    pass
+
+
+@contextlib.contextmanager
+def ceph_log(ctx, config):
+    """
+    Create /var/log/ceph log directory that is open to everyone.
+    Add valgrind and profiling-logger directories.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info('Making ceph log dir writeable by non-root...')
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                'chmod',
+                '777',
+                '/var/log/ceph',
+            ],
+            wait=False,
+        )
+    )
+    log.info('Disabling ceph logrotate...')
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                'rm', '-f', '--',
+                '/etc/logrotate.d/ceph',
+            ],
+            wait=False,
+        )
+    )
+    log.info('Creating extra log directories...')
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                'install', '-d', '-m0777', '--',
+                '/var/log/ceph/valgrind',
+                '/var/log/ceph/profiling-logger',
+            ],
+            wait=False,
+        )
+    )
+
+    # Add logs directory to job's info log file
+    update_archive_setting(ctx, 'log', '/var/log/ceph')
+
+    class Rotater(object):
+        stop_event = gevent.event.Event()
+
+        def invoke_logrotate(self):
+            # 1) install ceph-test.conf in /etc/logrotate.d
+            # 2) continuously loop over logrotate invocation with ceph-test.conf
+            while not self.stop_event.is_set():
+                self.stop_event.wait(timeout=30)
+                try:
+                    procs = ctx.cluster.run(
+                          args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'],
+                          wait=False,
+                          stderr=StringIO()
+                    )
+                    run.wait(procs)
+                except exceptions.ConnectionLostError as e:
+                    # Some tests may power off nodes during test, in which
+                    # case we will see connection errors that we should ignore.
+                    log.debug("Missed logrotate, node '{0}' is offline".format(
+                        e.node))
+                except EOFError:
+                    # Paramiko sometimes raises this when it fails to
+                    # connect to a node during open_session.  As with
+                    # ConnectionLostError, we ignore this because nodes
+                    # are allowed to get power cycled during tests.
+                    log.debug("Missed logrotate, EOFError")
+                except SSHException:
+                    log.debug("Missed logrotate, SSHException")
+                except run.CommandFailedError as e:
+                    for p in procs:
+                        if p.finished and p.exitstatus != 0:
+                            err = p.stderr.getvalue()
+                            if 'error: error renaming temp state file' in err:
+                                log.info('ignoring transient state error: %s', e)
+                            else:
+                                raise
+                except socket.error as e:
+                    if e.errno in (errno.EHOSTUNREACH, errno.ECONNRESET):
+                        log.debug("Missed logrotate, host unreachable")
+                    else:
+                        raise
+
+        def begin(self):
+            self.thread = gevent.spawn(self.invoke_logrotate)
+
+        def end(self):
+            self.stop_event.set()
+            self.thread.get()
+
+    def write_rotate_conf(ctx, daemons):
+        testdir = teuthology.get_testdir(ctx)
+        remote_logrotate_conf = '%s/logrotate.ceph-test.conf' % testdir
+        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
+        with open(rotate_conf_path) as f:
+            conf = ""
+            for daemon, size in daemons.items():
+                log.info('writing logrotate stanza for {}'.format(daemon))
+                conf += f.read().format(daemon_type=daemon,
+                                        max_size=size)
+                f.seek(0, 0)
+
+            for remote in ctx.cluster.remotes.keys():
+                remote.write_file(remote_logrotate_conf, BytesIO(conf.encode()))
+                remote.sh(
+                    f'sudo mv {remote_logrotate_conf} /etc/logrotate.d/ceph-test.conf && '
+                    'sudo chmod 0644 /etc/logrotate.d/ceph-test.conf && '
+                    'sudo chown root.root /etc/logrotate.d/ceph-test.conf')
+                remote.chcon('/etc/logrotate.d/ceph-test.conf',
+                             'system_u:object_r:etc_t:s0')
+
+    if ctx.config.get('log-rotate'):
+        daemons = ctx.config.get('log-rotate')
+        log.info('Setting up log rotation with ' + str(daemons))
+        write_rotate_conf(ctx, daemons)
+        logrotater = Rotater()
+        logrotater.begin()
+    try:
+        yield
+
+    finally:
+        if ctx.config.get('log-rotate'):
+            log.info('Shutting down logrotate')
+            logrotater.end()
+            ctx.cluster.sh('sudo rm /etc/logrotate.d/ceph-test.conf')
+        if ctx.archive is not None and \
+                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+            # and logs
+            log.info('Compressing logs...')
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo',
+                        'find',
+                        '/var/log/ceph',
+                        '-name',
+                        '*.log',
+                        '-print0',
+                        run.Raw('|'),
+                        'sudo',
+                        'xargs',
+                        '-0',
+                        '--no-run-if-empty',
+                        '--',
+                        'gzip',
+                        '--',
+                    ],
+                    wait=False,
+                ),
+            )
+
+            log.info('Archiving logs...')
+            path = os.path.join(ctx.archive, 'remote')
+            try:
+                os.makedirs(path)
+            except OSError:
+                pass
+            for remote in ctx.cluster.remotes.keys():
+                sub = os.path.join(path, remote.shortname)
+                try:
+                    os.makedirs(sub)
+                except OSError:
+                    pass
+                teuthology.pull_directory(remote, '/var/log/ceph',
+                                          os.path.join(sub, 'log'))
+
+
+def assign_devs(roles, devs):
+    """
+    Create a dictionary of devs indexed by roles
+
+    :param roles: List of roles
+    :param devs: Corresponding list of devices.
+    :returns: Dictionary of devs indexed by roles.
+    """
+    return dict(zip(roles, devs))
+
+
+@contextlib.contextmanager
+def valgrind_post(ctx, config):
+    """
+    After the tests run, look through all the valgrind logs.  Exceptions are raised
+    if textual errors occurred in the logs, or if valgrind exceptions were detected in
+    the logs.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    try:
+        yield
+    finally:
+        lookup_procs = list()
+        log.info('Checking for errors in any valgrind logs...')
+        for remote in ctx.cluster.remotes.keys():
+            # look at valgrind logs for each node
+            proc = remote.run(
+                args="sudo zgrep '<kind>' /var/log/ceph/valgrind/* "
+                     # include a second file so that we always get
+                     # a filename prefix on the output
+                     "/dev/null | sort | uniq",
+                wait=False,
+                check_status=False,
+                stdout=StringIO(),
+            )
+            lookup_procs.append((proc, remote))
+
+        valgrind_exception = None
+        for (proc, remote) in lookup_procs:
+            proc.wait()
+            out = proc.stdout.getvalue()
+            for line in out.split('\n'):
+                if line == '':
+                    continue
+                try:
+                    (file, kind) = line.split(':')
+                except Exception:
+                    log.error('failed to split line %s', line)
+                    raise
+                log.debug('file %s kind %s', file, kind)
+                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
+                    continue
+                log.error('saw valgrind issue %s in %s', kind, file)
+                valgrind_exception = Exception('saw valgrind issues')
+
+        if config.get('expect_valgrind_errors'):
+            if not valgrind_exception:
+                raise Exception('expected valgrind issues and found none')
+        else:
+            if valgrind_exception:
+                raise valgrind_exception
+
+
+@contextlib.contextmanager
+def crush_setup(ctx, config):
+    cluster_name = config['cluster']
+    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    profile = config.get('crush_tunables', 'default')
+    log.info('Setting crush tunables to %s', profile)
+    mon_remote.run(
+        args=['sudo', 'ceph', '--cluster', cluster_name,
+              'osd', 'crush', 'tunables', profile])
+    yield
+
+
+@contextlib.contextmanager
+def setup_manager(ctx, config):
+    first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+    if not hasattr(ctx, 'managers'):
+        ctx.managers = {}
+    ctx.managers[config['cluster']] = CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager.' + config['cluster']),
+        cluster=config['cluster'],
+    )
+    yield
+
+@contextlib.contextmanager
+def create_rbd_pool(ctx, config):
+    cluster_name = config['cluster']
+    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
+    log.info('Waiting for OSDs to come up')
+    teuthology.wait_until_osds_up(
+        ctx,
+        cluster=ctx.cluster,
+        remote=mon_remote,
+        ceph_cluster=cluster_name,
+    )
+    if config.get('create_rbd_pool', True):
+        log.info('Creating RBD pool')
+        mon_remote.run(
+            args=['sudo', 'ceph', '--cluster', cluster_name,
+                  'osd', 'pool', 'create', 'rbd', '8'])
+        mon_remote.run(
+            args=[
+                'sudo', 'ceph', '--cluster', cluster_name,
+                'osd', 'pool', 'application', 'enable',
+                'rbd', 'rbd', '--yes-i-really-mean-it'
+            ],
+            check_status=False)
+    yield
+
+@contextlib.contextmanager
+def cephfs_setup(ctx, config):
+    cluster_name = config['cluster']
+
+    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
+    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
+    # If there are any MDSs, then create a filesystem for them to use
+    # Do this last because requires mon cluster to be up and running
+    if mdss.remotes:
+        log.info('Setting up CephFS filesystem(s)...')
+        cephfs_config = config.get('cephfs', {})
+        fs_configs =  cephfs_config.pop('fs', [{'name': 'cephfs'}])
+        set_allow_multifs = len(fs_configs) > 1
+
+        # wait for standbys to become available (slow due to valgrind, perhaps)
+        mdsc = MDSCluster(ctx)
+        mds_count = len(list(teuthology.all_roles_of_type(ctx.cluster, 'mds')))
+        with contextutil.safe_while(sleep=2,tries=150) as proceed:
+            while proceed():
+                if len(mdsc.get_standby_daemons()) >= mds_count:
+                    break
+
+        fss = []
+        for fs_config in fs_configs:
+            assert isinstance(fs_config, dict)
+            name = fs_config.pop('name')
+            temp = deepcopy(cephfs_config)
+            teuthology.deep_merge(temp, fs_config)
+            fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
+            if set_allow_multifs:
+                fs.set_allow_multifs()
+                set_allow_multifs = False
+            fss.append(fs)
+
+        yield
+
+        for fs in fss:
+            fs.destroy()
+    else:
+        yield
+
+@contextlib.contextmanager
+def watchdog_setup(ctx, config):
+    ctx.ceph[config['cluster']].thrashers = []
+    ctx.ceph[config['cluster']].watchdog = DaemonWatchdog(ctx, config, ctx.ceph[config['cluster']].thrashers)
+    ctx.ceph[config['cluster']].watchdog.start()
+    yield
+
+def get_mons(roles, ips, cluster_name,
+             mon_bind_msgr2=False,
+             mon_bind_addrvec=False):
+    """
+    Get monitors and their associated addresses
+    """
+    mons = {}
+    v1_ports = {}
+    v2_ports = {}
+    is_mon = teuthology.is_type('mon', cluster_name)
+    for idx, roles in enumerate(roles):
+        for role in roles:
+            if not is_mon(role):
+                continue
+            if ips[idx] not in v1_ports:
+                v1_ports[ips[idx]] = 6789
+            else:
+                v1_ports[ips[idx]] += 1
+            if mon_bind_msgr2:
+                if ips[idx] not in v2_ports:
+                    v2_ports[ips[idx]] = 3300
+                    addr = '{ip}'.format(ip=ips[idx])
+                else:
+                    assert mon_bind_addrvec
+                    v2_ports[ips[idx]] += 1
+                    addr = '[v2:{ip}:{port2},v1:{ip}:{port1}]'.format(
+                        ip=ips[idx],
+                        port2=v2_ports[ips[idx]],
+                        port1=v1_ports[ips[idx]],
+                    )
+            elif mon_bind_addrvec:
+                addr = '[v1:{ip}:{port}]'.format(
+                    ip=ips[idx],
+                    port=v1_ports[ips[idx]],
+                )
+            else:
+                addr = '{ip}:{port}'.format(
+                    ip=ips[idx],
+                    port=v1_ports[ips[idx]],
+                )
+            mons[role] = addr
+    assert mons
+    return mons
+
+def skeleton_config(ctx, roles, ips, mons, cluster='ceph'):
+    """
+    Returns a ConfigObj that is prefilled with a skeleton config.
+
+    Use conf[section][key]=value or conf.merge to change it.
+
+    Use conf.write to write it out, override .filename first if you want.
+    """
+    path = os.path.join(os.path.dirname(__file__), 'ceph.conf.template')
+    conf = configobj.ConfigObj(path, file_error=True)
+    mon_hosts = []
+    for role, addr in mons.items():
+        mon_cluster, _, _ = teuthology.split_role(role)
+        if mon_cluster != cluster:
+            continue
+        name = teuthology.ceph_role(role)
+        conf.setdefault(name, {})
+        mon_hosts.append(addr)
+    conf.setdefault('global', {})
+    conf['global']['mon host'] = ','.join(mon_hosts)
+    # set up standby mds's
+    is_mds = teuthology.is_type('mds', cluster)
+    for roles_subset in roles:
+        for role in roles_subset:
+            if is_mds(role):
+                name = teuthology.ceph_role(role)
+                conf.setdefault(name, {})
+    return conf
+
+def create_simple_monmap(ctx, remote, conf, mons,
+                         path=None,
+                         mon_bind_addrvec=False):
+    """
+    Writes a simple monmap based on current ceph.conf into path, or
+    <testdir>/monmap by default.
+
+    Assumes ceph_conf is up to date.
+
+    Assumes mon sections are named "mon.*", with the dot.
+
+    :return the FSID (as a string) of the newly created monmap
+    """
+
+    addresses = list(mons.items())
+    assert addresses, "There are no monitors in config!"
+    log.debug('Ceph mon addresses: %s', addresses)
+
+    try:
+        log.debug('writing out conf {c}'.format(c=conf))
+    except:
+        log.debug('my conf logging attempt failed')
+    testdir = teuthology.get_testdir(ctx)
+    tmp_conf_path = '{tdir}/ceph.tmp.conf'.format(tdir=testdir)
+    conf_fp = BytesIO()
+    conf.write(conf_fp)
+    conf_fp.seek(0)
+    teuthology.write_file(remote, tmp_conf_path, conf_fp)
+    args = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'monmaptool',
+        '-c',
+        '{conf}'.format(conf=tmp_conf_path),
+        '--create',
+        '--clobber',
+    ]
+    if mon_bind_addrvec:
+        args.extend(['--enable-all-features'])
+    for (role, addr) in addresses:
+        _, _, n = teuthology.split_role(role)
+        if mon_bind_addrvec and (',' in addr or 'v' in addr or ':' in addr):
+            args.extend(('--addv', n, addr))
+        else:
+            args.extend(('--add', n, addr))
+    if not path:
+        path = '{tdir}/monmap'.format(tdir=testdir)
+    args.extend([
+        '--print',
+        path
+    ])
+
+    monmap_output = remote.sh(args)
+    fsid = re.search("generated fsid (.+)$",
+                     monmap_output, re.MULTILINE).group(1)
+    teuthology.delete_file(remote, tmp_conf_path)
+    return fsid
+
+
+def maybe_redirect_stderr(config, type_, args, log_path):
+    if type_ == 'osd' and \
+       config.get('flavor', 'default') == 'crimson':
+        # teuthworker uses ubuntu:ubuntu to access the test nodes
+        create_log_cmd = \
+            f'sudo install -b -o ubuntu -g ubuntu /dev/null {log_path}'
+        return create_log_cmd, args + [run.Raw('2>>'), log_path]
+    else:
+        return None, args
+
+
+@contextlib.contextmanager
+def cluster(ctx, config):
+    """
+    Handle the creation and removal of a ceph cluster.
+
+    On startup:
+        Create directories needed for the cluster.
+        Create remote journals for all osds.
+        Create and set keyring.
+        Copy the monmap to the test systems.
+        Setup mon nodes.
+        Setup mds nodes.
+        Mkfs osd nodes.
+        Add keyring information to monmaps
+        Mkfs mon nodes.
+
+    On exit:
+        If errors occurred, extract a failure message and store in ctx.summary.
+        Unmount all test files and temporary journaling files.
+        Save the monitor information and archive all ceph logs.
+        Cleanup the keyring setup, and remove all monitor map and data files left over.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    if ctx.config.get('use_existing_cluster', False) is True:
+        log.info("'use_existing_cluster' is true; skipping cluster creation")
+        yield
+
+    testdir = teuthology.get_testdir(ctx)
+    cluster_name = config['cluster']
+    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
+    log.info('Creating ceph cluster %s...', cluster_name)
+    log.info('config %s', config)
+    log.info('ctx.config %s', ctx.config)
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'install', '-d', '-m0755', '--',
+                data_dir,
+            ],
+            wait=False,
+        )
+    )
+
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                'install', '-d', '-m0777', '--', '/var/run/ceph',
+            ],
+            wait=False,
+        )
+    )
+
+    devs_to_clean = {}
+    remote_to_roles_to_devs = {}
+    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
+    for remote, roles_for_host in osds.remotes.items():
+        devs = teuthology.get_scratch_devices(remote)
+        roles_to_devs = assign_devs(
+            teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), devs
+        )
+        devs_to_clean[remote] = []
+        log.info('osd dev map: {}'.format(roles_to_devs))
+        assert roles_to_devs, \
+            "remote {} has osd roles, but no osd devices were specified!".format(remote.hostname)
+        remote_to_roles_to_devs[remote] = roles_to_devs
+    log.info("remote_to_roles_to_devs: {}".format(remote_to_roles_to_devs))
+    for osd_role, dev_name in remote_to_roles_to_devs.items():
+        assert dev_name, "{} has no associated device!".format(osd_role)
+
+    log.info('Generating config...')
+    remotes_and_roles = ctx.cluster.remotes.items()
+    roles = [role_list for (remote, role_list) in remotes_and_roles]
+    ips = [host for (host, port) in
+           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
+    mons = get_mons(
+        roles, ips, cluster_name,
+        mon_bind_msgr2=config.get('mon_bind_msgr2'),
+        mon_bind_addrvec=config.get('mon_bind_addrvec'),
+        )
+    conf = skeleton_config(
+        ctx, roles=roles, ips=ips, mons=mons, cluster=cluster_name,
+    )
+    for section, keys in config['conf'].items():
+        for key, value in keys.items():
+            log.info("[%s] %s = %s" % (section, key, value))
+            if section not in conf:
+                conf[section] = {}
+            conf[section][key] = value
+
+    if not hasattr(ctx, 'ceph'):
+        ctx.ceph = {}
+    ctx.ceph[cluster_name] = argparse.Namespace()
+    ctx.ceph[cluster_name].conf = conf
+    ctx.ceph[cluster_name].mons = mons
+
+    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
+    keyring_path = config.get('keyring_path', default_keyring)
+
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+
+    log.info('Setting up %s...' % firstmon)
+    ctx.cluster.only(firstmon).run(
+        args=[
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            coverage_dir,
+            'ceph-authtool',
+            '--create-keyring',
+            keyring_path,
+        ],
+    )
+    ctx.cluster.only(firstmon).run(
+        args=[
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            coverage_dir,
+            'ceph-authtool',
+            '--gen-key',
+            '--name=mon.',
+            keyring_path,
+        ],
+    )
+    ctx.cluster.only(firstmon).run(
+        args=[
+            'sudo',
+            'chmod',
+            '0644',
+            keyring_path,
+        ],
+    )
+    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
+                                                   cluster=cluster_name)
+    fsid = create_simple_monmap(
+        ctx,
+        remote=mon0_remote,
+        conf=conf,
+        mons=mons,
+        path=monmap_path,
+        mon_bind_addrvec=config.get('mon_bind_addrvec'),
+    )
+    ctx.ceph[cluster_name].fsid = fsid
+    if not 'global' in conf:
+        conf['global'] = {}
+    conf['global']['fsid'] = fsid
+
+    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
+    conf_path = config.get('conf_path', default_conf_path)
+    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
+    write_conf(ctx, conf_path, cluster_name)
+
+    log.info('Creating admin key on %s...' % firstmon)
+    ctx.cluster.only(firstmon).run(
+        args=[
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            coverage_dir,
+            'ceph-authtool',
+            '--gen-key',
+            '--name=client.admin',
+            '--cap', 'mon', 'allow *',
+            '--cap', 'osd', 'allow *',
+            '--cap', 'mds', 'allow *',
+            '--cap', 'mgr', 'allow *',
+            keyring_path,
+        ],
+    )
+
+    log.info('Copying monmap to all nodes...')
+    keyring = mon0_remote.read_file(keyring_path)
+    monmap = mon0_remote.read_file(monmap_path)
+
+    for rem in ctx.cluster.remotes.keys():
+        # copy mon key and initial monmap
+        log.info('Sending monmap to node {remote}'.format(remote=rem))
+        rem.write_file(keyring_path, keyring, mode='0644', sudo=True)
+        rem.write_file(monmap_path, monmap)
+
+    log.info('Setting up mon nodes...')
+    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
+
+    if not config.get('skip_mgr_daemons', False):
+        log.info('Setting up mgr nodes...')
+        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
+        for remote, roles_for_host in mgrs.remotes.items():
+            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
+                                                         cluster_name):
+                _, _, id_ = teuthology.split_role(role)
+                mgr_dir = DATA_PATH.format(
+                    type_='mgr', cluster=cluster_name, id_=id_)
+                remote.run(
+                    args=[
+                        'sudo',
+                        'mkdir',
+                        '-p',
+                        mgr_dir,
+                        run.Raw('&&'),
+                        'sudo',
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        coverage_dir,
+                        'ceph-authtool',
+                        '--create-keyring',
+                        '--gen-key',
+                        '--name=mgr.{id}'.format(id=id_),
+                        mgr_dir + '/keyring',
+                    ],
+                )
+
+    log.info('Setting up mds nodes...')
+    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
+    for remote, roles_for_host in mdss.remotes.items():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
+                                                     cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            mds_dir = DATA_PATH.format(
+                type_='mds', cluster=cluster_name, id_=id_)
+            remote.run(
+                args=[
+                    'sudo',
+                    'mkdir',
+                    '-p',
+                    mds_dir,
+                    run.Raw('&&'),
+                    'sudo',
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    coverage_dir,
+                    'ceph-authtool',
+                    '--create-keyring',
+                    '--gen-key',
+                    '--name=mds.{id}'.format(id=id_),
+                    mds_dir + '/keyring',
+                ],
+            )
+            remote.run(args=[
+                'sudo', 'chown', '-R', 'ceph:ceph', mds_dir
+            ])
+
+    cclient.create_keyring(ctx, cluster_name)
+    log.info('Running mkfs on osd nodes...')
+
+    if not hasattr(ctx, 'disk_config'):
+        ctx.disk_config = argparse.Namespace()
+    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
+        ctx.disk_config.remote_to_roles_to_dev = {}
+    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
+        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
+    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
+        ctx.disk_config.remote_to_roles_to_dev_fstype = {}
+
+    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
+
+    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
+    for remote, roles_for_host in osds.remotes.items():
+        roles_to_devs = remote_to_roles_to_devs[remote]
+
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            mnt_point = DATA_PATH.format(
+                type_='osd', cluster=cluster_name, id_=id_)
+            remote.run(
+                args=[
+                    'sudo',
+                    'mkdir',
+                    '-p',
+                    mnt_point,
+                ])
+            log.info('roles_to_devs: {}'.format(roles_to_devs))
+            log.info('role: {}'.format(role))
+            if roles_to_devs.get(role):
+                dev = roles_to_devs[role]
+                fs = config.get('fs')
+                package = None
+                mkfs_options = config.get('mkfs_options')
+                mount_options = config.get('mount_options')
+                if fs == 'btrfs':
+                    # package = 'btrfs-tools'
+                    if mount_options is None:
+                        mount_options = ['noatime', 'user_subvol_rm_allowed']
+                    if mkfs_options is None:
+                        mkfs_options = ['-m', 'single',
+                                        '-l', '32768',
+                                        '-n', '32768']
+                if fs == 'xfs':
+                    # package = 'xfsprogs'
+                    if mount_options is None:
+                        mount_options = ['noatime']
+                    if mkfs_options is None:
+                        mkfs_options = ['-f', '-i', 'size=2048']
+                if fs == 'ext4' or fs == 'ext3':
+                    if mount_options is None:
+                        mount_options = ['noatime', 'user_xattr']
+
+                if mount_options is None:
+                    mount_options = []
+                if mkfs_options is None:
+                    mkfs_options = []
+                mkfs = ['mkfs.%s' % fs] + mkfs_options
+                log.info('%s on %s on %s' % (mkfs, dev, remote))
+                if package is not None:
+                    remote.sh('sudo apt-get install -y %s' % package)
+
+                try:
+                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
+                except run.CommandFailedError:
+                    # Newer btfs-tools doesn't prompt for overwrite, use -f
+                    if '-f' not in mount_options:
+                        mkfs_options.append('-f')
+                        mkfs = ['mkfs.%s' % fs] + mkfs_options
+                        log.info('%s on %s on %s' % (mkfs, dev, remote))
+                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
+
+                log.info('mount %s on %s -o %s' % (dev, remote,
+                                                   ','.join(mount_options)))
+                remote.run(
+                    args=[
+                        'sudo',
+                        'mount',
+                        '-t', fs,
+                        '-o', ','.join(mount_options),
+                        dev,
+                        mnt_point,
+                    ]
+                )
+                remote.run(
+                    args=[
+                        'sudo', '/sbin/restorecon', mnt_point,
+                    ],
+                    check_status=False,
+                )
+                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
+                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
+                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
+                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
+                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
+                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
+                devs_to_clean[remote].append(mnt_point)
+
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            try:
+                args = ['sudo',
+                        'MALLOC_CHECK_=3',
+                        'adjust-ulimits',
+                        'ceph-coverage', coverage_dir,
+                        'ceph-osd',
+                        '--no-mon-config',
+                        '--cluster', cluster_name,
+                        '--mkfs',
+                        '--mkkey',
+                        '-i', id_,
+                        '--monmap', monmap_path]
+                log_path = f'/var/log/ceph/{cluster_name}-osd.{id_}.log'
+                create_log_cmd, args = \
+                    maybe_redirect_stderr(config, 'osd', args, log_path)
+                if create_log_cmd:
+                    remote.sh(create_log_cmd)
+                remote.run(args=args)
+            except run.CommandFailedError:
+                # try without --no-mon-config.. this may be an upgrade test
+                remote.run(
+                    args=[
+                        'sudo',
+                        'MALLOC_CHECK_=3',
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        coverage_dir,
+                        'ceph-osd',
+                        '--cluster',
+                        cluster_name,
+                        '--mkfs',
+                        '--mkkey',
+                        '-i', id_,
+                    '--monmap', monmap_path,
+                    ],
+                )
+            mnt_point = DATA_PATH.format(
+                type_='osd', cluster=cluster_name, id_=id_)
+            remote.run(args=[
+                'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
+            ])
+
+    log.info('Reading keys from all nodes...')
+    keys_fp = BytesIO()
+    keys = []
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for type_ in ['mgr',  'mds', 'osd']:
+            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
+                continue
+            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
+                _, _, id_ = teuthology.split_role(role)
+                data = remote.read_file(
+                    os.path.join(
+                        DATA_PATH.format(
+                            type_=type_, id_=id_, cluster=cluster_name),
+                        'keyring',
+                    ),
+                    sudo=True,
+                )
+                keys.append((type_, id_, data))
+                keys_fp.write(data)
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            data = remote.read_file(
+                '/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
+            )
+            keys.append(('client', id_, data))
+            keys_fp.write(data)
+
+    log.info('Adding keys to all mons...')
+    writes = mons.run(
+        args=[
+            'sudo', 'tee', '-a',
+            keyring_path,
+        ],
+        stdin=run.PIPE,
+        wait=False,
+        stdout=BytesIO(),
+    )
+    keys_fp.seek(0)
+    teuthology.feed_many_stdins_and_close(keys_fp, writes)
+    run.wait(writes)
+    for type_, id_, data in keys:
+        run.wait(
+            mons.run(
+                args=[
+                         'sudo',
+                         'adjust-ulimits',
+                         'ceph-coverage',
+                         coverage_dir,
+                         'ceph-authtool',
+                         keyring_path,
+                         '--name={type}.{id}'.format(
+                             type=type_,
+                             id=id_,
+                         ),
+                     ] + list(generate_caps(type_)),
+                wait=False,
+            ),
+        )
+
+    log.info('Running mkfs on mon nodes...')
+    for remote, roles_for_host in mons.remotes.items():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            mnt_point = DATA_PATH.format(
+                type_='mon', id_=id_, cluster=cluster_name)
+            remote.run(
+                args=[
+                    'sudo',
+                    'mkdir',
+                    '-p',
+                    mnt_point,
+                ],
+            )
+            remote.run(
+                args=[
+                    'sudo',
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    coverage_dir,
+                    'ceph-mon',
+                    '--cluster', cluster_name,
+                    '--mkfs',
+                    '-i', id_,
+                    '--monmap', monmap_path,
+                    '--keyring', keyring_path,
+                ],
+            )
+            remote.run(args=[
+                'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
+            ])
+
+    run.wait(
+        mons.run(
+            args=[
+                'rm',
+                '--',
+                monmap_path,
+            ],
+            wait=False,
+        ),
+    )
+
+    try:
+        yield
+    except Exception:
+        # we need to know this below
+        ctx.summary['success'] = False
+        raise
+    finally:
+        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+
+        log.info('Checking cluster log for badness...')
+
+        def first_in_ceph_log(pattern, excludes):
+            """
+            Find the first occurrence of the pattern specified in the Ceph log,
+            Returns None if none found.
+
+            :param pattern: Pattern scanned for.
+            :param excludes: Patterns to ignore.
+            :return: First line of text (or None if not found)
+            """
+            args = [
+                'sudo',
+                'egrep', pattern,
+                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
+            ]
+            for exclude in excludes:
+                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
+            args.extend([
+                run.Raw('|'), 'head', '-n', '1',
+            ])
+            stdout = mon0_remote.sh(args)
+            return stdout or None
+
+        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
+                             config['log_ignorelist']) is not None:
+            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
+            ctx.summary['success'] = False
+            # use the most severe problem as the failure reason
+            if 'failure_reason' not in ctx.summary:
+                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
+                    match = first_in_ceph_log(pattern, config['log_ignorelist'])
+                    if match is not None:
+                        ctx.summary['failure_reason'] = \
+                            '"{match}" in cluster log'.format(
+                                match=match.rstrip('\n'),
+                            )
+                        break
+
+        for remote, dirs in devs_to_clean.items():
+            for dir_ in dirs:
+                log.info('Unmounting %s on %s' % (dir_, remote))
+                try:
+                    remote.run(
+                        args=[
+                            'sync',
+                            run.Raw('&&'),
+                            'sudo',
+                            'umount',
+                            '-f',
+                            dir_
+                        ]
+                    )
+                except Exception as e:
+                    remote.run(args=[
+                        'sudo',
+                        run.Raw('PATH=/usr/sbin:$PATH'),
+                        'lsof',
+                        run.Raw(';'),
+                        'ps', 'auxf',
+                    ])
+                    raise e
+
+        if ctx.archive is not None and \
+                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+
+            # archive mon data, too
+            log.info('Archiving mon data...')
+            path = os.path.join(ctx.archive, 'data')
+            try:
+                os.makedirs(path)
+            except OSError as e:
+                if e.errno == errno.EEXIST:
+                    pass
+                else:
+                    raise
+            for remote, roles in mons.remotes.items():
+                for role in roles:
+                    is_mon = teuthology.is_type('mon', cluster_name)
+                    if is_mon(role):
+                        _, _, id_ = teuthology.split_role(role)
+                        mon_dir = DATA_PATH.format(
+                            type_='mon', id_=id_, cluster=cluster_name)
+                        teuthology.pull_directory_tarball(
+                            remote,
+                            mon_dir,
+                            path + '/' + role + '.tgz')
+
+        log.info('Cleaning ceph cluster...')
+        run.wait(
+            ctx.cluster.run(
+                args=[
+                    'sudo',
+                    'rm',
+                    '-rf',
+                    '--',
+                    conf_path,
+                    keyring_path,
+                    data_dir,
+                    monmap_path,
+                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
+                ],
+                wait=False,
+            ),
+        )
+
+
+def osd_scrub_pgs(ctx, config):
+    """
+    Scrub pgs when we exit.
+
+    First make sure all pgs are active and clean.
+    Next scrub all osds.
+    Then periodically check until all pgs have scrub time stamps that
+    indicate the last scrub completed.  Time out if no progress is made
+    here after two minutes.
+    """
+    retries = 40
+    delays = 20
+    cluster_name = config['cluster']
+    manager = ctx.managers[cluster_name]
+    for _ in range(retries):
+        stats = manager.get_pg_stats()
+        unclean = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
+        split_merge = []
+        osd_dump = manager.get_osd_dump_json()
+        try:
+            split_merge = [i['pool_name'] for i in osd_dump['pools'] if i['pg_num'] != i['pg_num_target']]
+        except KeyError:
+            # we don't support pg_num_target before nautilus
+            pass
+        if not unclean and not split_merge:
+            break
+        waiting_on = []
+        if unclean:
+            waiting_on.append(f'{unclean} to go clean')
+        if split_merge:
+            waiting_on.append(f'{split_merge} to split/merge')
+        waiting_on = ' and '.join(waiting_on)
+        log.info('Waiting for all PGs to be active+clean and split+merged, waiting on %s', waiting_on)
+        time.sleep(delays)
+    else:
+        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
+    check_time_now = time.localtime()
+    time.sleep(1)
+    all_roles = teuthology.all_roles(ctx.cluster)
+    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
+        log.info("Scrubbing {osd}".format(osd=role))
+        _, _, id_ = teuthology.split_role(role)
+        # allow this to fail; in certain cases the OSD might not be up
+        # at this point.  we will catch all pgs below.
+        try:
+            manager.raw_cluster_cmd('tell', 'osd.' + id_, 'config', 'set',
+                                    'osd_debug_deep_scrub_sleep', '0');
+            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
+        except run.CommandFailedError:
+            pass
+    prev_good = 0
+    gap_cnt = 0
+    loop = True
+    while loop:
+        stats = manager.get_pg_stats()
+        timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
+        loop = False
+        thiscnt = 0
+        re_scrub = []
+        for (pgid, tmval) in timez:
+            t = tmval[0:tmval.find('.')].replace(' ', 'T')
+            pgtm = time.strptime(t, '%Y-%m-%dT%H:%M:%S')
+            if pgtm > check_time_now:
+                thiscnt += 1
+            else:
+                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
+                loop = True
+                re_scrub.append(pgid)
+        if thiscnt > prev_good:
+            prev_good = thiscnt
+            gap_cnt = 0
+        else:
+            gap_cnt += 1
+            if gap_cnt % 6 == 0:
+                for pgid in re_scrub:
+                    # re-request scrub every so often in case the earlier
+                    # request was missed.  do not do it every time because
+                    # the scrub may be in progress or not reported yet and
+                    # we will starve progress.
+                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
+            if gap_cnt > retries:
+                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
+        if loop:
+            log.info('Still waiting for all pgs to be scrubbed.')
+            time.sleep(delays)
+
+
+@contextlib.contextmanager
+def run_daemon(ctx, config, type_):
+    """
+    Run daemons for a role type.  Handle the startup and termination of a a daemon.
+    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
+    and a max_mds value for one mds.
+    On cleanup -- Stop all existing daemons of this type.
+
+    :param ctx: Context
+    :param config: Configuration
+    :param type_: Role type
+    """
+    cluster_name = config['cluster']
+    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
+    testdir = teuthology.get_testdir(ctx)
+    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
+
+    # check whether any daemons if this type are configured
+    if daemons is None:
+        return
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+    daemon_signal = 'kill'
+    if config.get('coverage') or config.get('valgrind') is not None:
+        daemon_signal = 'term'
+
+    # create osds in order.  (this only matters for pre-luminous, which might
+    # be jewel/hammer, which doesn't take an id_ argument to legacy 'osd create').
+    osd_uuids  = {}
+    for remote, roles_for_host in daemons.remotes.items():
+        is_type_ = teuthology.is_type(type_, cluster_name)
+        for role in roles_for_host:
+            if not is_type_(role):
+                continue
+            _, _, id_ = teuthology.split_role(role)
+
+
+            if type_ == 'osd':
+                datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
+                    cluster=cluster_name, id=id_)
+                osd_uuid = remote.read_file(
+                    datadir + '/fsid', sudo=True).decode().strip()
+                osd_uuids[id_] = osd_uuid
+    for osd_id in range(len(osd_uuids)):
+        id_ = str(osd_id)
+        osd_uuid = osd_uuids.get(id_)
+        try:
+            remote.run(
+                args=[
+                'sudo', 'ceph', '--cluster', cluster_name,
+                    'osd', 'new', osd_uuid, id_,
+                ]
+            )
+        except:
+            # fallback to pre-luminous (jewel)
+            remote.run(
+                args=[
+                'sudo', 'ceph', '--cluster', cluster_name,
+                    'osd', 'create', osd_uuid,
+                ]
+            )
+            if config.get('add_osds_to_crush'):
+                remote.run(
+                args=[
+                    'sudo', 'ceph', '--cluster', cluster_name,
+                    'osd', 'crush', 'create-or-move', 'osd.' + id_,
+                    '1.0', 'host=localhost', 'root=default',
+                ]
+            )
+
+    for remote, roles_for_host in daemons.remotes.items():
+        is_type_ = teuthology.is_type(type_, cluster_name)
+        for role in roles_for_host:
+            if not is_type_(role):
+                continue
+            _, _, id_ = teuthology.split_role(role)
+
+            run_cmd = [
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                coverage_dir,
+                'daemon-helper',
+                daemon_signal,
+            ]
+            run_cmd_tail = [
+                'ceph-%s' % (type_),
+                '-f',
+                '--cluster', cluster_name,
+                '-i', id_]
+
+            if type_ in config.get('cpu_profile', []):
+                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
+                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
+
+            vc = config.get('valgrind')
+            if vc is not None:
+                valgrind_args = None
+                if type_ in vc:
+                    valgrind_args = vc[type_]
+                if role in vc:
+                    valgrind_args = vc[role]
+                exit_on_first_error = vc.get('exit_on_first_error', True)
+                run_cmd = get_valgrind_args(testdir, role, run_cmd, valgrind_args,
+                    exit_on_first_error=exit_on_first_error)
+
+            run_cmd.extend(run_cmd_tail)
+            log_path = f'/var/log/ceph/{cluster_name}-{type_}.{id_}.log'
+            create_log_cmd, run_cmd = \
+                maybe_redirect_stderr(config, type_, run_cmd, log_path)
+            if create_log_cmd:
+                remote.sh(create_log_cmd)
+            # always register mgr; don't necessarily start
+            ctx.daemons.register_daemon(
+                remote, type_, id_,
+                cluster=cluster_name,
+                args=run_cmd,
+                logger=log.getChild(role),
+                stdin=run.PIPE,
+                wait=False
+            )
+            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
+                role = cluster_name + '.' + type_
+                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
+
+    # kludge: run any pre-manager commands
+    if type_ == 'mon':
+        for cmd in config.get('pre-mgr-commands', []):
+            firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+            (remote,) = ctx.cluster.only(firstmon).remotes.keys()
+            remote.run(args=cmd.split(' '))
+
+    try:
+        yield
+    finally:
+        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
+
+
+def healthy(ctx, config):
+    """
+    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    config = config if isinstance(config, dict) else dict()
+    cluster_name = config.get('cluster', 'ceph')
+    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
+    manager = ctx.managers[cluster_name]
+    try:
+        manager.wait_for_mgr_available(timeout=30)
+    except (run.CommandFailedError, AssertionError) as e:
+        log.info('ignoring mgr wait error, probably testing upgrade: %s', e)
+
+    manager.wait_for_all_osds_up(timeout=300)
+
+    try:
+        manager.flush_all_pg_stats()
+    except (run.CommandFailedError, Exception) as e:
+        log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
+    manager.wait_for_clean()
+
+    if config.get('wait-for-healthy', True):
+        log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
+        manager.wait_until_healthy(timeout=300)
+
+    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
+        # Some MDSs exist, wait for them to be healthy
+        ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
+        ceph_fs.wait_for_daemons(timeout=300)
+
+
+def wait_for_mon_quorum(ctx, config):
+    """
+    Check renote ceph status until all monitors are up.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    if isinstance(config, dict):
+        mons = config['daemons']
+        cluster_name = config.get('cluster', 'ceph')
+    else:
+        assert isinstance(config, list)
+        mons = config
+        cluster_name = 'ceph'
+    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
+    with contextutil.safe_while(sleep=10, tries=60,
+                                action='wait for monitor quorum') as proceed:
+        while proceed():
+            quorum_status = remote.sh('sudo ceph quorum_status',
+                                      logger=log.getChild('quorum_status'))
+            j = json.loads(quorum_status)
+            q = j.get('quorum_names', [])
+            log.debug('Quorum: %s', q)
+            if sorted(q) == sorted(mons):
+                break
+
+
+def created_pool(ctx, config):
+    """
+    Add new pools to the dictionary of pools that the ceph-manager
+    knows about.
+    """
+    for new_pool in config:
+        if new_pool not in ctx.managers['ceph'].pools:
+            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_int_property(
+                new_pool, 'pg_num')
+
+
+@contextlib.contextmanager
+def suppress_mon_health_to_clog(ctx, config):
+    """
+    set the option, and then restore it with its original value
+
+    Note, due to the way how tasks are executed/nested, it's not suggested to
+    use this method as a standalone task. otherwise, it's likely that it will
+    restore the tweaked option at the /end/ of 'tasks' block.
+    """
+    if config.get('mon-health-to-clog', 'true') == 'false':
+        cluster = config.get('cluster', 'ceph')
+        manager = ctx.managers[cluster]
+        manager.raw_cluster_command(
+            'config', 'set', 'mon', 'mon_health_to_clog', 'false'
+        )
+        yield
+        manager.raw_cluster_command(
+            'config', 'rm', 'mon', 'mon_health_to_clog'
+        )
+    else:
+        yield
+
+@contextlib.contextmanager
+def restart(ctx, config):
+    """
+   restart ceph daemons
+
+   For example::
+      tasks:
+      - ceph.restart: [all]
+
+   For example::
+      tasks:
+      - ceph.restart: [osd.0, mon.1, mds.*]
+
+   or::
+
+      tasks:
+      - ceph.restart:
+          daemons: [osd.0, mon.1]
+          wait-for-healthy: false
+          wait-for-osds-up: true
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    if config is None:
+        config = {}
+    elif isinstance(config, list):
+        config = {'daemons': config}
+
+    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
+    clusters = set()
+
+    with suppress_mon_health_to_clog(ctx, config):
+        for role in daemons:
+            cluster, type_, id_ = teuthology.split_role(role)
+            ctx.daemons.get_daemon(type_, id_, cluster).stop()
+            if type_ == 'osd':
+                ctx.managers[cluster].mark_down_osd(id_)
+            ctx.daemons.get_daemon(type_, id_, cluster).restart()
+            clusters.add(cluster)
+
+    if config.get('wait-for-healthy', True):
+        for cluster in clusters:
+            healthy(ctx=ctx, config=dict(cluster=cluster))
+    if config.get('wait-for-osds-up', False):
+        for cluster in clusters:
+            ctx.managers[cluster].wait_for_all_osds_up()
+    yield
+
+
+@contextlib.contextmanager
+def stop(ctx, config):
+    """
+    Stop ceph daemons
+
+    For example::
+      tasks:
+      - ceph.stop: [mds.*]
+
+      tasks:
+      - ceph.stop: [osd.0, osd.2]
+
+      tasks:
+      - ceph.stop:
+          daemons: [osd.0, osd.2]
+
+    """
+    if config is None:
+        config = {}
+    elif isinstance(config, list):
+        config = {'daemons': config}
+
+    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
+    clusters = set()
+
+    for role in daemons:
+        cluster, type_, id_ = teuthology.split_role(role)
+        ctx.daemons.get_daemon(type_, id_, cluster).stop()
+        clusters.add(cluster)
+
+
+    for cluster in clusters:
+        ctx.ceph[cluster].watchdog.stop()
+        ctx.ceph[cluster].watchdog.join()
+
+    yield
+
+
+@contextlib.contextmanager
+def wait_for_failure(ctx, config):
+    """
+    Wait for a failure of a ceph daemon
+
+    For example::
+      tasks:
+      - ceph.wait_for_failure: [mds.*]
+
+      tasks:
+      - ceph.wait_for_failure: [osd.0, osd.2]
+
+      tasks:
+      - ceph.wait_for_failure:
+          daemons: [osd.0, osd.2]
+
+    """
+    if config is None:
+        config = {}
+    elif isinstance(config, list):
+        config = {'daemons': config}
+
+    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
+    for role in daemons:
+        cluster, type_, id_ = teuthology.split_role(role)
+        try:
+            ctx.daemons.get_daemon(type_, id_, cluster).wait()
+        except:
+            log.info('Saw expected daemon failure.  Continuing.')
+            pass
+        else:
+            raise RuntimeError('daemon %s did not fail' % role)
+
+    yield
+
+
+def validate_config(ctx, config):
+    """
+    Perform some simple validation on task configuration.
+    Raises exceptions.ConfigError if an error is found.
+    """
+    # check for osds from multiple clusters on the same host
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        last_cluster = None
+        last_role = None
+        for role in roles_for_host:
+            role_cluster, role_type, _ = teuthology.split_role(role)
+            if role_type != 'osd':
+                continue
+            if last_cluster and last_cluster != role_cluster:
+                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
+                    last_role, role)
+                raise exceptions.ConfigError(msg)
+            last_cluster = role_cluster
+            last_role = role
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Set up and tear down a Ceph cluster.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - interactive:
+
+    You can also specify what branch to run::
+
+        tasks:
+        - ceph:
+            branch: foo
+
+    Or a tag::
+
+        tasks:
+        - ceph:
+            tag: v0.42.13
+
+    Or a sha1::
+
+        tasks:
+        - ceph:
+            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
+
+    Or a local source dir::
+
+        tasks:
+        - ceph:
+            path: /home/sage/ceph
+
+    To capture code coverage data, use::
+
+        tasks:
+        - ceph:
+            coverage: true
+
+    To use btrfs, ext4, or xfs on the target's scratch disks, use::
+
+        tasks:
+        - ceph:
+            fs: xfs
+            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
+            mount_options: [nobarrier, inode64]
+
+    To change the cephfs's default max_mds (1), use::
+
+        tasks:
+        - ceph:
+            cephfs:
+              max_mds: 2
+
+    To change the max_mds of a specific filesystem, use::
+
+        tasks:
+        - ceph:
+            cephfs:
+              max_mds: 2
+              fs:
+                - name: a
+                  max_mds: 3
+                - name: b
+
+    In the above example, filesystem 'a' will have 'max_mds' 3,
+    and filesystme 'b' will have 'max_mds' 2.
+
+    To change the mdsmap's default session_timeout (60 seconds), use::
+
+        tasks:
+        - ceph:
+            cephfs:
+              session_timeout: 300
+
+    Note, this will cause the task to check the /scratch_devs file on each node
+    for available devices.  If no such file is found, /dev/sdb will be used.
+
+    To run some daemons under valgrind, include their names
+    and the tool/args to use in a valgrind section::
+
+        tasks:
+        - ceph:
+          valgrind:
+            mds.1: --tool=memcheck
+            osd.1: [--tool=memcheck, --leak-check=no]
+
+    Those nodes which are using memcheck or valgrind will get
+    checked for bad results.
+
+    To adjust or modify config options, use::
+
+        tasks:
+        - ceph:
+            conf:
+              section:
+                key: value
+
+    For example::
+
+        tasks:
+        - ceph:
+            conf:
+              mds.0:
+                some option: value
+                other key: other value
+              client.0:
+                debug client: 10
+                debug ms: 1
+
+    By default, the cluster log is checked for errors and warnings,
+    and the run marked failed if any appear. You can ignore log
+    entries by giving a list of egrep compatible regexes, i.e.:
+
+        tasks:
+        - ceph:
+            log-ignorelist: ['foo.*bar', 'bad message']
+
+    To run multiple ceph clusters, use multiple ceph tasks, and roles
+    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
+    cluster use the default cluster name, 'ceph'. OSDs from separate
+    clusters must be on separate hosts. Clients and non-osd daemons
+    from multiple clusters may be colocated. For each cluster, add an
+    instance of the ceph task with the cluster name specified, e.g.::
+
+        roles:
+        - [mon.a, osd.0, osd.1]
+        - [backup.mon.a, backup.osd.0, backup.osd.1]
+        - [client.0, backup.client.0]
+        tasks:
+        - ceph:
+            cluster: ceph
+        - ceph:
+            cluster: backup
+
+    :param ctx: Context
+    :param config: Configuration
+
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        "task ceph only supports a dictionary for configuration"
+
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('ceph', {}))
+
+    first_ceph_cluster = False
+    if not hasattr(ctx, 'daemons'):
+        first_ceph_cluster = True
+        ctx.daemons = DaemonGroup()
+
+    testdir = teuthology.get_testdir(ctx)
+    if config.get('coverage'):
+        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+        log.info('Creating coverage directory...')
+        run.wait(
+            ctx.cluster.run(
+                args=[
+                    'install', '-d', '-m0755', '--',
+                    coverage_dir,
+                ],
+                wait=False,
+            )
+        )
+
+    if 'cluster' not in config:
+        config['cluster'] = 'ceph'
+
+    validate_config(ctx, config)
+
+    subtasks = []
+    if first_ceph_cluster:
+        # these tasks handle general log setup and parsing on all hosts,
+        # so they should only be run once
+        subtasks = [
+            lambda: ceph_log(ctx=ctx, config=None),
+            lambda: ceph_crash(ctx=ctx, config=None),
+            lambda: valgrind_post(ctx=ctx, config=config),
+        ]
+
+    subtasks += [
+        lambda: cluster(ctx=ctx, config=dict(
+            conf=config.get('conf', {}),
+            fs=config.get('fs', 'xfs'),
+            mkfs_options=config.get('mkfs_options', None),
+            mount_options=config.get('mount_options', None),
+            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
+            log_ignorelist=config.get('log-ignorelist', []),
+            cpu_profile=set(config.get('cpu_profile', []),),
+            cluster=config['cluster'],
+            mon_bind_msgr2=config.get('mon_bind_msgr2', True),
+            mon_bind_addrvec=config.get('mon_bind_addrvec', True),
+        )),
+        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
+        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
+        lambda: crush_setup(ctx=ctx, config=config),
+        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
+        lambda: setup_manager(ctx=ctx, config=config),
+        lambda: create_rbd_pool(ctx=ctx, config=config),
+        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
+        lambda: cephfs_setup(ctx=ctx, config=config),
+        lambda: watchdog_setup(ctx=ctx, config=config),
+    ]
+
+    with contextutil.nested(*subtasks):
+        try:
+            if config.get('wait-for-healthy', True):
+                healthy(ctx=ctx, config=dict(cluster=config['cluster']))
+
+            yield
+        finally:
+            # set pg_num_targets back to actual pg_num, so we don't have to
+            # wait for pending merges (which can take a while!)
+            ctx.managers[config['cluster']].stop_pg_num_changes()
+
+            if config.get('wait-for-scrub', True):
+                # wait for pgs to become active+clean in case any
+                # recoveries were triggered since the last health check
+                ctx.managers[config['cluster']].wait_for_clean()
+                osd_scrub_pgs(ctx, config)
+
+            # stop logging health to clog during shutdown, or else we generate
+            # a bunch of scary messages unrelated to our actual run.
+            firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
+            (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+            mon0_remote.run(
+                args=[
+                    'sudo',
+                    'ceph',
+                    '--cluster', config['cluster'],
+                    'config', 'set', 'global',
+                    'mon_health_to_clog', 'false',
+                ],
+                check_status=False,
+            )
diff --git a/qa/tasks/ceph_client.py b/qa/tasks/ceph_client.py
new file mode 100644
index 000000000..74e818f93
--- /dev/null
+++ b/qa/tasks/ceph_client.py
@@ -0,0 +1,42 @@
+"""
+Set up client keyring
+"""
+import logging
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def create_keyring(ctx, cluster_name):
+    """
+    Set up key ring on remote sites
+    """
+    log.info('Setting up client nodes...')
+    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
+    testdir = teuthology.get_testdir(ctx)
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+    for remote, roles_for_host in clients.remotes.items():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
+                                                     cluster_name):
+            name = teuthology.ceph_role(role)
+            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name, name)
+            remote.run(
+                args=[
+                    'sudo',
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    coverage_dir,
+                    'ceph-authtool',
+                    '--create-keyring',
+                    '--gen-key',
+                    # TODO this --name= is not really obeyed, all unknown "types" are munged to "client"
+                    '--name={name}'.format(name=name),
+                    client_keyring,
+                    run.Raw('&&'),
+                    'sudo',
+                    'chmod',
+                    '0644',
+                    client_keyring,
+                    ],
+                )
diff --git a/qa/tasks/ceph_deploy.py b/qa/tasks/ceph_deploy.py
new file mode 100644
index 000000000..99c8c1ffb
--- /dev/null
+++ b/qa/tasks/ceph_deploy.py
@@ -0,0 +1,916 @@
+"""
+Execute ceph-deploy as a task
+"""
+
+import contextlib
+import os
+import time
+import logging
+import traceback
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.task import install as install_fn
+from teuthology.orchestra import run
+from tasks.cephfs.filesystem import Filesystem
+from teuthology.misc import wait_until_healthy
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download_ceph_deploy(ctx, config):
+    """
+    Downloads ceph-deploy from the ceph.com git mirror and (by default)
+    switches to the master branch. If the `ceph-deploy-branch` is specified, it
+    will use that instead. The `bootstrap` script is ran, with the argument
+    obtained from `python_version`, if specified.
+    """
+    # use mon.a for ceph_admin
+    (ceph_admin,) = ctx.cluster.only('mon.a').remotes.keys()
+
+    try:
+        py_ver = str(config['python_version'])
+    except KeyError:
+        pass
+    else:
+        supported_versions = ['2', '3']
+        if py_ver not in supported_versions:
+            raise ValueError("python_version must be: {}, not {}".format(
+                ' or '.join(supported_versions), py_ver
+            ))
+
+        log.info("Installing Python")
+        system_type = teuthology.get_system_type(ceph_admin)
+
+        if system_type == 'rpm':
+            package = 'python36' if py_ver == '3' else 'python'
+            ctx.cluster.run(args=[
+                'sudo', 'yum', '-y', 'install',
+                package, 'python-virtualenv'
+            ])
+        else:
+            package = 'python3' if py_ver == '3' else 'python'
+            ctx.cluster.run(args=[
+                'sudo', 'apt-get', '-y', '--force-yes', 'install',
+                package, 'python-virtualenv'
+            ])
+
+    log.info('Downloading ceph-deploy...')
+    testdir = teuthology.get_testdir(ctx)
+    ceph_deploy_branch = config.get('ceph-deploy-branch', 'master')
+
+    ceph_admin.run(
+        args=[
+            'git', 'clone', '-b', ceph_deploy_branch,
+            teuth_config.ceph_git_base_url + 'ceph-deploy.git',
+            '{tdir}/ceph-deploy'.format(tdir=testdir),
+        ],
+    )
+    args = [
+        'cd',
+        '{tdir}/ceph-deploy'.format(tdir=testdir),
+        run.Raw('&&'),
+        './bootstrap',
+    ]
+    try:
+        args.append(str(config['python_version']))
+    except KeyError:
+        pass
+    ceph_admin.run(args=args)
+
+    try:
+        yield
+    finally:
+        log.info('Removing ceph-deploy ...')
+        ceph_admin.run(
+            args=[
+                'rm',
+                '-rf',
+                '{tdir}/ceph-deploy'.format(tdir=testdir),
+            ],
+        )
+
+
+def is_healthy(ctx, config):
+    """Wait until a Ceph cluster is healthy."""
+    testdir = teuthology.get_testdir(ctx)
+    ceph_admin = teuthology.get_first_mon(ctx, config)
+    (remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
+    max_tries = 90  # 90 tries * 10 secs --> 15 minutes
+    tries = 0
+    while True:
+        tries += 1
+        if tries >= max_tries:
+            msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes"
+            remote.run(
+                args=[
+                    'cd',
+                    '{tdir}'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'sudo', 'ceph',
+                    'report',
+                ],
+            )
+            raise RuntimeError(msg)
+
+        out = remote.sh(
+            [
+                'cd',
+                '{tdir}'.format(tdir=testdir),
+                run.Raw('&&'),
+                'sudo', 'ceph',
+                'health',
+            ],
+            logger=log.getChild('health'),
+        )
+        log.info('Ceph health: %s', out.rstrip('\n'))
+        if out.split(None, 1)[0] == 'HEALTH_OK':
+            break
+        time.sleep(10)
+
+
+def get_nodes_using_role(ctx, target_role):
+    """
+    Extract the names of nodes that match a given role from a cluster, and modify the
+    cluster's service IDs to match the resulting node-based naming scheme that ceph-deploy
+    uses, such that if "mon.a" is on host "foo23", it'll be renamed to "mon.foo23".
+    """
+
+    # Nodes containing a service of the specified role
+    nodes_of_interest = []
+
+    # Prepare a modified version of cluster.remotes with ceph-deploy-ized names
+    modified_remotes = {}
+    ceph_deploy_mapped = dict()
+    for _remote, roles_for_host in ctx.cluster.remotes.items():
+        modified_remotes[_remote] = []
+        for svc_id in roles_for_host:
+            if svc_id.startswith("{0}.".format(target_role)):
+                fqdn = str(_remote).split('@')[-1]
+                nodename = str(str(_remote).split('.')[0]).split('@')[1]
+                if target_role == 'mon':
+                    nodes_of_interest.append(fqdn)
+                else:
+                    nodes_of_interest.append(nodename)
+                mapped_role = "{0}.{1}".format(target_role, nodename)
+                modified_remotes[_remote].append(mapped_role)
+                # keep dict of mapped role for later use by tasks
+                # eg. mon.a => mon.node1
+                ceph_deploy_mapped[svc_id] = mapped_role
+            else:
+                modified_remotes[_remote].append(svc_id)
+
+    ctx.cluster.remotes = modified_remotes
+    # since the function is called multiple times for target roles
+    # append new mapped roles
+    if not hasattr(ctx.cluster, 'mapped_role'):
+        ctx.cluster.mapped_role = ceph_deploy_mapped
+    else:
+        ctx.cluster.mapped_role.update(ceph_deploy_mapped)
+    log.info("New mapped_role={mr}".format(mr=ctx.cluster.mapped_role))
+    return nodes_of_interest
+
+
+def get_dev_for_osd(ctx, config):
+    """Get a list of all osd device names."""
+    osd_devs = []
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        host = remote.name.split('@')[-1]
+        shortname = host.split('.')[0]
+        devs = teuthology.get_scratch_devices(remote)
+        num_osd_per_host = list(
+            teuthology.roles_of_type(
+                roles_for_host, 'osd'))
+        num_osds = len(num_osd_per_host)
+        if config.get('separate_journal_disk') is not None:
+            num_devs_reqd = 2 * num_osds
+            assert num_devs_reqd <= len(
+                devs), 'fewer data and journal disks than required ' + shortname
+            for dindex in range(0, num_devs_reqd, 2):
+                jd_index = dindex + 1
+                dev_short = devs[dindex].split('/')[-1]
+                jdev_short = devs[jd_index].split('/')[-1]
+                osd_devs.append((shortname, dev_short, jdev_short))
+        else:
+            assert num_osds <= len(devs), 'fewer disks than osds ' + shortname
+            for dev in devs[:num_osds]:
+                dev_short = dev.split('/')[-1]
+                osd_devs.append((shortname, dev_short))
+    return osd_devs
+
+
+def get_all_nodes(ctx, config):
+    """Return a string of node names separated by blanks"""
+    nodelist = []
+    for t, k in ctx.config['targets'].items():
+        host = t.split('@')[-1]
+        simple_host = host.split('.')[0]
+        nodelist.append(simple_host)
+    nodelist = " ".join(nodelist)
+    return nodelist
+
+@contextlib.contextmanager
+def build_ceph_cluster(ctx, config):
+    """Build a ceph cluster"""
+
+    # Expect to find ceph_admin on the first mon by ID, same place that the download task
+    # puts it.  Remember this here, because subsequently IDs will change from those in
+    # the test config to those that ceph-deploy invents.
+
+    (ceph_admin,) = ctx.cluster.only('mon.a').remotes.keys()
+
+    def execute_ceph_deploy(cmd):
+        """Remotely execute a ceph_deploy command"""
+        return ceph_admin.run(
+            args=[
+                'cd',
+                '{tdir}/ceph-deploy'.format(tdir=testdir),
+                run.Raw('&&'),
+                run.Raw(cmd),
+            ],
+            check_status=False,
+        ).exitstatus
+
+    def ceph_disk_osd_create(ctx, config):
+        node_dev_list = get_dev_for_osd(ctx, config)
+        no_of_osds = 0
+        for d in node_dev_list:
+            node = d[0]
+            for disk in d[1:]:
+                zap = './ceph-deploy disk zap ' + node + ' ' + disk
+                estatus = execute_ceph_deploy(zap)
+                if estatus != 0:
+                    raise RuntimeError("ceph-deploy: Failed to zap osds")
+            osd_create_cmd = './ceph-deploy osd create '
+            # first check for filestore, default is bluestore with ceph-deploy
+            if config.get('filestore') is not None:
+                osd_create_cmd += '--filestore '
+            elif config.get('bluestore') is not None:
+                osd_create_cmd += '--bluestore '
+            if config.get('dmcrypt') is not None:
+                osd_create_cmd += '--dmcrypt '
+            osd_create_cmd += ":".join(d)
+            estatus_osd = execute_ceph_deploy(osd_create_cmd)
+            if estatus_osd == 0:
+                log.info('successfully created osd')
+                no_of_osds += 1
+            else:
+                raise RuntimeError("ceph-deploy: Failed to create osds")
+        return no_of_osds
+
+    def ceph_volume_osd_create(ctx, config):
+        osds = ctx.cluster.only(teuthology.is_type('osd'))
+        no_of_osds = 0
+        for remote in osds.remotes.keys():
+            # all devs should be lvm
+            osd_create_cmd = './ceph-deploy osd create --debug ' + remote.shortname + ' '
+            # default is bluestore so we just need config item for filestore
+            roles = ctx.cluster.remotes[remote]
+            dev_needed = len([role for role in roles
+                              if role.startswith('osd')])
+            all_devs = teuthology.get_scratch_devices(remote)
+            log.info("node={n}, need_devs={d}, available={a}".format(
+                        n=remote.shortname,
+                        d=dev_needed,
+                        a=all_devs,
+                        ))
+            devs = all_devs[0:dev_needed]
+            # rest of the devices can be used for journal if required
+            jdevs = dev_needed
+            for device in devs:
+                device_split = device.split('/')
+                lv_device = device_split[-2] + '/' + device_split[-1]
+                if config.get('filestore') is not None:
+                    osd_create_cmd += '--filestore --data ' + lv_device + ' '
+                    # filestore with ceph-volume also needs journal disk
+                    try:
+                        jdevice = all_devs.pop(jdevs)
+                    except IndexError:
+                        raise RuntimeError("No device available for \
+                                            journal configuration")
+                    jdevice_split = jdevice.split('/')
+                    j_lv = jdevice_split[-2] + '/' + jdevice_split[-1]
+                    osd_create_cmd += '--journal ' + j_lv
+                else:
+                    osd_create_cmd += ' --data ' + lv_device
+                estatus_osd = execute_ceph_deploy(osd_create_cmd)
+                if estatus_osd == 0:
+                    log.info('successfully created osd')
+                    no_of_osds += 1
+                else:
+                    raise RuntimeError("ceph-deploy: Failed to create osds")
+        return no_of_osds
+
+    try:
+        log.info('Building ceph cluster using ceph-deploy...')
+        testdir = teuthology.get_testdir(ctx)
+        ceph_branch = None
+        if config.get('branch') is not None:
+            cbranch = config.get('branch')
+            for var, val in cbranch.items():
+                ceph_branch = '--{var}={val}'.format(var=var, val=val)
+        all_nodes = get_all_nodes(ctx, config)
+        mds_nodes = get_nodes_using_role(ctx, 'mds')
+        mds_nodes = " ".join(mds_nodes)
+        mon_node = get_nodes_using_role(ctx, 'mon')
+        mon_nodes = " ".join(mon_node)
+        # skip mgr based on config item
+        # this is needed when test uses latest code to install old ceph
+        # versions
+        skip_mgr = config.get('skip-mgr', False)
+        if not skip_mgr:
+            mgr_nodes = get_nodes_using_role(ctx, 'mgr')
+            mgr_nodes = " ".join(mgr_nodes)
+        new_mon = './ceph-deploy new' + " " + mon_nodes
+        if not skip_mgr:
+            mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
+        mon_hostname = mon_nodes.split(' ')[0]
+        mon_hostname = str(mon_hostname)
+        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
+        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
+
+        if mon_nodes is None:
+            raise RuntimeError("no monitor nodes in the config file")
+
+        estatus_new = execute_ceph_deploy(new_mon)
+        if estatus_new != 0:
+            raise RuntimeError("ceph-deploy: new command failed")
+
+        log.info('adding config inputs...')
+        testdir = teuthology.get_testdir(ctx)
+        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
+
+        if config.get('conf') is not None:
+            confp = config.get('conf')
+            for section, keys in confp.items():
+                lines = '[{section}]\n'.format(section=section)
+                ceph_admin.sudo_write_file(conf_path, lines, append=True)
+                for key, value in keys.items():
+                    log.info("[%s] %s = %s" % (section, key, value))
+                    lines = '{key} = {value}\n'.format(key=key, value=value)
+                    ceph_admin.sudo_write_file(conf_path, lines, append=True)
+
+        # install ceph
+        dev_branch = ctx.config['branch']
+        branch = '--dev={branch}'.format(branch=dev_branch)
+        if ceph_branch:
+            option = ceph_branch
+        else:
+            option = branch
+        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
+        estatus_install = execute_ceph_deploy(install_nodes)
+        if estatus_install != 0:
+            raise RuntimeError("ceph-deploy: Failed to install ceph")
+        # install ceph-test package too
+        install_nodes2 = './ceph-deploy install --tests ' + option + \
+                         " " + all_nodes
+        estatus_install = execute_ceph_deploy(install_nodes2)
+        if estatus_install != 0:
+            raise RuntimeError("ceph-deploy: Failed to install ceph-test")
+
+        mon_create_nodes = './ceph-deploy mon create-initial'
+        # If the following fails, it is OK, it might just be that the monitors
+        # are taking way more than a minute/monitor to form quorum, so lets
+        # try the next block which will wait up to 15 minutes to gatherkeys.
+        execute_ceph_deploy(mon_create_nodes)
+
+        estatus_gather = execute_ceph_deploy(gather_keys)
+        if estatus_gather != 0:
+            raise RuntimeError("ceph-deploy: Failed during gather keys")
+
+        # install admin key on mons (ceph-create-keys doesn't do this any more)
+        mons = ctx.cluster.only(teuthology.is_type('mon'))
+        for remote in mons.remotes.keys():
+            execute_ceph_deploy('./ceph-deploy admin ' + remote.shortname)
+
+        # create osd's
+        if config.get('use-ceph-volume', False):
+            no_of_osds = ceph_volume_osd_create(ctx, config)
+        else:
+            # this method will only work with ceph-deploy v1.5.39 or older
+            no_of_osds = ceph_disk_osd_create(ctx, config)
+
+        if not skip_mgr:
+            execute_ceph_deploy(mgr_create)
+
+        if mds_nodes:
+            estatus_mds = execute_ceph_deploy(deploy_mds)
+            if estatus_mds != 0:
+                raise RuntimeError("ceph-deploy: Failed to deploy mds")
+
+        if config.get('test_mon_destroy') is not None:
+            for d in range(1, len(mon_node)):
+                mon_destroy_nodes = './ceph-deploy mon destroy' + \
+                    " " + mon_node[d]
+                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
+                if estatus_mon_d != 0:
+                    raise RuntimeError("ceph-deploy: Failed to delete monitor")
+
+
+
+        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
+            is_healthy(ctx=ctx, config=None)
+
+            log.info('Setting up client nodes...')
+            conf_path = '/etc/ceph/ceph.conf'
+            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
+            first_mon = teuthology.get_first_mon(ctx, config)
+            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
+            conf_data = mon0_remote.read_file(conf_path, sudo=True)
+            admin_keyring = mon0_remote.read_file(admin_keyring_path, sudo=True)
+
+            clients = ctx.cluster.only(teuthology.is_type('client'))
+            for remote, roles_for_host in clients.remotes.items():
+                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
+                    client_keyring = \
+                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
+                    mon0_remote.run(
+                        args=[
+                            'cd',
+                            '{tdir}'.format(tdir=testdir),
+                            run.Raw('&&'),
+                            'sudo', 'bash', '-c',
+                            run.Raw('"'), 'ceph',
+                            'auth',
+                            'get-or-create',
+                            'client.{id}'.format(id=id_),
+                            'mds', 'allow',
+                            'mon', 'allow *',
+                            'osd', 'allow *',
+                            run.Raw('>'),
+                            client_keyring,
+                            run.Raw('"'),
+                        ],
+                    )
+                    key_data = mon0_remote.read_file(
+                        path=client_keyring,
+                        sudo=True,
+                    )
+                    remote.sudo_write_file(
+                        path=client_keyring,
+                        data=key_data,
+                        mode='0644'
+                    )
+                    remote.sudo_write_file(
+                        path=admin_keyring_path,
+                        data=admin_keyring,
+                        mode='0644'
+                    )
+                    remote.sudo_write_file(
+                        path=conf_path,
+                        data=conf_data,
+                        mode='0644'
+                    )
+
+            if mds_nodes:
+                log.info('Configuring CephFS...')
+                Filesystem(ctx, create=True)
+        elif not config.get('only_mon'):
+            raise RuntimeError(
+                "The cluster is NOT operational due to insufficient OSDs")
+        # create rbd pool
+        ceph_admin.run(
+            args=[
+                'sudo', 'ceph', '--cluster', 'ceph',
+                'osd', 'pool', 'create', 'rbd', '128', '128'],
+            check_status=False)
+        ceph_admin.run(
+            args=[
+                'sudo', 'ceph', '--cluster', 'ceph',
+                'osd', 'pool', 'application', 'enable',
+                'rbd', 'rbd', '--yes-i-really-mean-it'
+                ],
+            check_status=False)
+        yield
+
+    except Exception:
+        log.info(
+            "Error encountered, logging exception before tearing down ceph-deploy")
+        log.info(traceback.format_exc())
+        raise
+    finally:
+        if config.get('keep_running'):
+            return
+        log.info('Stopping ceph...')
+        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'],
+                        check_status=False)
+        time.sleep(4)
+
+        # and now just check for the processes themselves, as if upstart/sysvinit
+        # is lying to us. Ignore errors if the grep fails
+        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
+                              'grep', '-v', 'grep', run.Raw('|'),
+                              'grep', 'ceph'], check_status=False)
+        ctx.cluster.run(args=['sudo', 'systemctl', run.Raw('|'),
+                              'grep', 'ceph'], check_status=False)
+
+        if ctx.archive is not None:
+            # archive mon data, too
+            log.info('Archiving mon data...')
+            path = os.path.join(ctx.archive, 'data')
+            os.makedirs(path)
+            mons = ctx.cluster.only(teuthology.is_type('mon'))
+            for remote, roles in mons.remotes.items():
+                for role in roles:
+                    if role.startswith('mon.'):
+                        teuthology.pull_directory_tarball(
+                            remote,
+                            '/var/lib/ceph/mon',
+                            path + '/' + role + '.tgz')
+
+            log.info('Compressing logs...')
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo',
+                        'find',
+                        '/var/log/ceph',
+                        '-name',
+                        '*.log',
+                        '-print0',
+                        run.Raw('|'),
+                        'sudo',
+                        'xargs',
+                        '-0',
+                        '--no-run-if-empty',
+                        '--',
+                        'gzip',
+                        '--',
+                    ],
+                    wait=False,
+                ),
+            )
+
+            log.info('Archiving logs...')
+            path = os.path.join(ctx.archive, 'remote')
+            os.makedirs(path)
+            for remote in ctx.cluster.remotes.keys():
+                sub = os.path.join(path, remote.shortname)
+                os.makedirs(sub)
+                teuthology.pull_directory(remote, '/var/log/ceph',
+                                          os.path.join(sub, 'log'))
+
+        # Prevent these from being undefined if the try block fails
+        all_nodes = get_all_nodes(ctx, config)
+        purge_nodes = './ceph-deploy purge' + " " + all_nodes
+        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes
+
+        log.info('Purging package...')
+        execute_ceph_deploy(purge_nodes)
+        log.info('Purging data...')
+        execute_ceph_deploy(purgedata_nodes)
+
+
+@contextlib.contextmanager
+def cli_test(ctx, config):
+    """
+     ceph-deploy cli to exercise most commonly use cli's and ensure
+     all commands works and also startup the init system.
+
+    """
+    log.info('Ceph-deploy Test')
+    if config is None:
+        config = {}
+    test_branch = ''
+    conf_dir = teuthology.get_testdir(ctx) + "/cdtest"
+
+    def execute_cdeploy(admin, cmd, path):
+        """Execute ceph-deploy commands """
+        """Either use git path or repo path """
+        args = ['cd', conf_dir, run.Raw(';')]
+        if path:
+            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
+        else:
+            args.append('ceph-deploy')
+        args.append(run.Raw(cmd))
+        ec = admin.run(args=args, check_status=False).exitstatus
+        if ec != 0:
+            raise RuntimeError(
+                "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))
+
+    if config.get('rhbuild'):
+        path = None
+    else:
+        path = teuthology.get_testdir(ctx)
+        # test on branch from config eg: wip-* , master or next etc
+        # packages for all distro's should exist for wip*
+        if ctx.config.get('branch'):
+            branch = ctx.config.get('branch')
+            test_branch = ' --dev={branch} '.format(branch=branch)
+    mons = ctx.cluster.only(teuthology.is_type('mon'))
+    for node, role in mons.remotes.items():
+        admin = node
+        admin.run(args=['mkdir', conf_dir], check_status=False)
+        nodename = admin.shortname
+    system_type = teuthology.get_system_type(admin)
+    if config.get('rhbuild'):
+        admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
+    log.info('system type is %s', system_type)
+    osds = ctx.cluster.only(teuthology.is_type('osd'))
+
+    for remote, roles in osds.remotes.items():
+        devs = teuthology.get_scratch_devices(remote)
+        log.info("roles %s", roles)
+        if (len(devs) < 3):
+            log.error(
+                'Test needs minimum of 3 devices, only found %s',
+                str(devs))
+            raise RuntimeError("Needs minimum of 3 devices ")
+
+    conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
+    new_cmd = 'new ' + nodename
+    execute_cdeploy(admin, new_cmd, path)
+    if config.get('conf') is not None:
+        confp = config.get('conf')
+        for section, keys in confp.items():
+            lines = '[{section}]\n'.format(section=section)
+            admin.sudo_write_file(conf_path, lines, append=True)
+            for key, value in keys.items():
+                log.info("[%s] %s = %s" % (section, key, value))
+                lines = '{key} = {value}\n'.format(key=key, value=value)
+                admin.sudo_write_file(conf_path, lines, append=True)
+    new_mon_install = 'install {branch} --mon '.format(
+        branch=test_branch) + nodename
+    new_mgr_install = 'install {branch} --mgr '.format(
+        branch=test_branch) + nodename
+    new_osd_install = 'install {branch} --osd '.format(
+        branch=test_branch) + nodename
+    new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
+    create_initial = 'mon create-initial '
+    mgr_create = 'mgr create ' + nodename
+    # either use create-keys or push command
+    push_keys = 'admin ' + nodename
+    execute_cdeploy(admin, new_mon_install, path)
+    execute_cdeploy(admin, new_mgr_install, path)
+    execute_cdeploy(admin, new_osd_install, path)
+    execute_cdeploy(admin, new_admin, path)
+    execute_cdeploy(admin, create_initial, path)
+    execute_cdeploy(admin, mgr_create, path)
+    execute_cdeploy(admin, push_keys, path)
+
+    for i in range(3):
+        zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
+        prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
+        execute_cdeploy(admin, zap_disk, path)
+        execute_cdeploy(admin, prepare, path)
+
+    log.info("list files for debugging purpose to check file permissions")
+    admin.run(args=['ls', run.Raw('-lt'), conf_dir])
+    remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
+    out = remote.sh('sudo ceph health')
+    log.info('Ceph health: %s', out.rstrip('\n'))
+    log.info("Waiting for cluster to become healthy")
+    with contextutil.safe_while(sleep=10, tries=6,
+                                action='check health') as proceed:
+        while proceed():
+            out = remote.sh('sudo ceph health')
+            if (out.split(None, 1)[0] == 'HEALTH_OK'):
+                break
+    rgw_install = 'install {branch} --rgw {node}'.format(
+        branch=test_branch,
+        node=nodename,
+    )
+    rgw_create = 'rgw create ' + nodename
+    execute_cdeploy(admin, rgw_install, path)
+    execute_cdeploy(admin, rgw_create, path)
+    log.info('All ceph-deploy cli tests passed')
+    try:
+        yield
+    finally:
+        log.info("cleaning up")
+        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'],
+                        check_status=False)
+        time.sleep(4)
+        for i in range(3):
+            umount_dev = "{d}1".format(d=devs[i])
+            remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
+        cmd = 'purge ' + nodename
+        execute_cdeploy(admin, cmd, path)
+        cmd = 'purgedata ' + nodename
+        execute_cdeploy(admin, cmd, path)
+        log.info("Removing temporary dir")
+        admin.run(
+            args=[
+                'rm',
+                run.Raw('-rf'),
+                run.Raw(conf_dir)],
+            check_status=False)
+        if config.get('rhbuild'):
+            admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
+
+
+@contextlib.contextmanager
+def single_node_test(ctx, config):
+    """
+    - ceph-deploy.single_node_test: null
+
+    #rhbuild testing
+    - ceph-deploy.single_node_test:
+        rhbuild: 1.2.3
+
+    """
+    log.info("Testing ceph-deploy on single node")
+    if config is None:
+        config = {}
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
+
+    if config.get('rhbuild'):
+        log.info("RH Build, Skip Download")
+        with contextutil.nested(
+            lambda: cli_test(ctx=ctx, config=config),
+        ):
+            yield
+    else:
+        with contextutil.nested(
+            lambda: install_fn.ship_utilities(ctx=ctx, config=None),
+            lambda: download_ceph_deploy(ctx=ctx, config=config),
+            lambda: cli_test(ctx=ctx, config=config),
+        ):
+            yield
+
+
+@contextlib.contextmanager
+def upgrade(ctx, config):
+    """
+     Upgrade using ceph-deploy
+     eg:
+       ceph-deploy.upgrade:
+          # to upgrade to specific branch, use
+          branch:
+             stable: jewel
+           # to setup mgr node, use
+           setup-mgr-node: True
+           # to wait for cluster to be healthy after all upgrade, use
+           wait-for-healthy: True
+           role: (upgrades the below roles serially)
+              mon.a
+              mon.b
+              osd.0
+     """
+    roles = config.get('roles')
+    # get the roles that are mapped as per ceph-deploy
+    # roles are mapped for mon/mds eg: mon.a  => mon.host_short_name
+    mapped_role = ctx.cluster.mapped_role
+    log.info("roles={r}, mapped_roles={mr}".format(r=roles, mr=mapped_role))
+    if config.get('branch'):
+        branch = config.get('branch')
+        (var, val) = branch.items()[0]
+        ceph_branch = '--{var}={val}'.format(var=var, val=val)
+    else:
+        # default to wip-branch under test
+        dev_branch = ctx.config['branch']
+        ceph_branch = '--dev={branch}'.format(branch=dev_branch)
+    # get the node used for initial deployment which is mon.a
+    mon_a = mapped_role.get('mon.a')
+    (ceph_admin,) = ctx.cluster.only(mon_a).remotes.keys()
+    testdir = teuthology.get_testdir(ctx)
+    cmd = './ceph-deploy install ' + ceph_branch
+    for role in roles:
+        # check if this role is mapped (mon or mds)
+        if mapped_role.get(role):
+            role = mapped_role.get(role)
+        remotes_and_roles = ctx.cluster.only(role).remotes
+        for remote, roles in remotes_and_roles.items():
+            nodename = remote.shortname
+            cmd = cmd + ' ' + nodename
+            log.info("Upgrading ceph on  %s", nodename)
+            ceph_admin.run(
+                args=[
+                    'cd',
+                    '{tdir}/ceph-deploy'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    run.Raw(cmd),
+                ],
+            )
+            # restart all ceph services, ideally upgrade should but it does not
+            remote.run(
+                args=[
+                    'sudo', 'systemctl', 'restart', 'ceph.target'
+                ]
+            )
+            ceph_admin.run(args=['sudo', 'ceph', '-s'])
+
+    # workaround for http://tracker.ceph.com/issues/20950
+    # write the correct mgr key to disk
+    if config.get('setup-mgr-node', None):
+        mons = ctx.cluster.only(teuthology.is_type('mon'))
+        for remote, roles in mons.remotes.items():
+            remote.run(
+                args=[
+                    run.Raw('sudo ceph auth get client.bootstrap-mgr'),
+                    run.Raw('|'),
+                    run.Raw('sudo tee'),
+                    run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring')
+                ]
+            )
+
+    if config.get('setup-mgr-node', None):
+        mgr_nodes = get_nodes_using_role(ctx, 'mgr')
+        mgr_nodes = " ".join(mgr_nodes)
+        mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes
+        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
+        # install mgr
+        ceph_admin.run(
+            args=[
+                'cd',
+                '{tdir}/ceph-deploy'.format(tdir=testdir),
+                run.Raw('&&'),
+                run.Raw(mgr_install),
+                ],
+            )
+        # create mgr
+        ceph_admin.run(
+            args=[
+                'cd',
+                '{tdir}/ceph-deploy'.format(tdir=testdir),
+                run.Raw('&&'),
+                run.Raw(mgr_create),
+                ],
+            )
+        ceph_admin.run(args=['sudo', 'ceph', '-s'])
+    if config.get('wait-for-healthy', None):
+        wait_until_healthy(ctx, ceph_admin, use_sudo=True)
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Set up and tear down a Ceph cluster.
+
+    For example::
+
+        tasks:
+        - install:
+             extras: yes
+        - ssh_keys:
+        - ceph-deploy:
+             branch:
+                stable: bobtail
+             mon_initial_members: 1
+             ceph-deploy-branch: my-ceph-deploy-branch
+             only_mon: true
+             keep_running: true
+             # either choose bluestore or filestore, default is bluestore
+             bluestore: True
+             # or
+             filestore: True
+             # skip install of mgr for old release using below flag
+             skip-mgr: True  ( default is False )
+             # to use ceph-volume instead of ceph-disk
+             # ceph-disk can only be used with old ceph-deploy release from pypi
+             use-ceph-volume: true
+
+        tasks:
+        - install:
+             extras: yes
+        - ssh_keys:
+        - ceph-deploy:
+             branch:
+                dev: master
+             conf:
+                mon:
+                   debug mon = 20
+
+        tasks:
+        - install:
+             extras: yes
+        - ssh_keys:
+        - ceph-deploy:
+             branch:
+                testing:
+             dmcrypt: yes
+             separate_journal_disk: yes
+
+    """
+    if config is None:
+        config = {}
+
+    assert isinstance(config, dict), \
+        "task ceph-deploy only supports a dictionary for configuration"
+
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))
+
+    if config.get('branch') is not None:
+        assert isinstance(
+            config['branch'], dict), 'branch must be a dictionary'
+
+    log.info('task ceph-deploy with config ' + str(config))
+
+    # we need to use 1.5.39-stable for testing jewel or master branch with
+    # ceph-disk
+    if config.get('use-ceph-volume', False) is False:
+        # check we are not testing specific branch
+        if config.get('ceph-deploy-branch', False) is False:
+            config['ceph-deploy-branch'] = '1.5.39-stable'
+
+    with contextutil.nested(
+        lambda: install_fn.ship_utilities(ctx=ctx, config=None),
+        lambda: download_ceph_deploy(ctx=ctx, config=config),
+        lambda: build_ceph_cluster(ctx=ctx, config=config),
+    ):
+        yield
diff --git a/qa/tasks/ceph_fuse.py b/qa/tasks/ceph_fuse.py
new file mode 100644
index 000000000..d2db29732
--- /dev/null
+++ b/qa/tasks/ceph_fuse.py
@@ -0,0 +1,175 @@
+"""
+Ceph FUSE client task
+"""
+
+import contextlib
+import logging
+
+from teuthology import misc
+from tasks.cephfs.fuse_mount import FuseMount
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Mount/unmount a ``ceph-fuse`` client.
+
+    The config is optional and defaults to mounting on all clients. If
+    a config is given, it is expected to be a list of clients to do
+    this operation on. This lets you e.g. set up one client with
+    ``ceph-fuse`` and another with ``kclient``.
+
+    ``brxnet`` should be a Private IPv4 Address range, default range is
+    [192.168.0.0/16]
+
+    Example that mounts all clients::
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - interactive:
+        - brxnet: [192.168.0.0/16]
+
+    Example that uses both ``kclient` and ``ceph-fuse``::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - kclient: [client.1]
+        - interactive:
+
+    Example that enables valgrind:
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+            client.0:
+              valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
+        - interactive:
+
+    Example that stops an already-mounted client:
+
+    ::
+
+        tasks:
+            - ceph:
+            - ceph-fuse: [client.0]
+            - ... do something that requires the FS mounted ...
+            - ceph-fuse:
+                client.0:
+                    mounted: false
+            - ... do something that requires the FS unmounted ...
+
+    Example that adds more generous wait time for mount (for virtual machines):
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+            client.0:
+              mount_wait: 60 # default is 0, do not wait before checking /sys/
+              mount_timeout: 120 # default is 30, give up if /sys/ is not populated
+        - interactive:
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info('Running ceph_fuse task...')
+
+    if config is None:
+        ids = misc.all_roles_of_type(ctx.cluster, 'client')
+        client_roles = [f'client.{id_}' for id_ in ids]
+        config = dict([r, dict()] for r in client_roles)
+    elif isinstance(config, list):
+        client_roles = config
+        config = dict([r, dict()] for r in client_roles)
+    elif isinstance(config, dict):
+        client_roles = filter(lambda x: 'client.' in x, config.keys())
+    else:
+        raise ValueError(f"Invalid config object: {config} ({config.__class__})")
+    log.info(f"config is {config}")
+
+    clients = list(misc.get_clients(ctx=ctx, roles=client_roles))
+    testdir = misc.get_testdir(ctx)
+    all_mounts = getattr(ctx, 'mounts', {})
+    mounted_by_me = {}
+    skipped = {}
+    remotes = set()
+
+    brxnet = config.get("brxnet", None)
+
+    # Construct any new FuseMount instances
+    overrides = ctx.config.get('overrides', {}).get('ceph-fuse', {})
+    top_overrides = dict(filter(lambda x: 'client.' not in x[0], overrides.items()))
+    for id_, remote in clients:
+        entity = f"client.{id_}"
+        client_config = config.get(entity)
+        if client_config is None:
+            client_config = {}
+        # top level overrides
+        misc.deep_merge(client_config, top_overrides)
+        # mount specific overrides
+        client_config_overrides = overrides.get(entity)
+        misc.deep_merge(client_config, client_config_overrides)
+        log.info(f"{entity} config is {client_config}")
+
+        remotes.add(remote)
+        auth_id = client_config.get("auth_id", id_)
+        cephfs_name = client_config.get("cephfs_name")
+
+        skip = client_config.get("skip", False)
+        if skip:
+            skipped[id_] = skip
+            continue
+
+        if id_ not in all_mounts:
+            fuse_mount = FuseMount(ctx=ctx, client_config=client_config,
+                                   test_dir=testdir, client_id=auth_id,
+                                   client_remote=remote, brxnet=brxnet,
+                                   cephfs_name=cephfs_name)
+            all_mounts[id_] = fuse_mount
+        else:
+            # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client
+            assert isinstance(all_mounts[id_], FuseMount)
+
+        if not config.get("disabled", False) and client_config.get('mounted', True):
+            mounted_by_me[id_] = {"config": client_config, "mount": all_mounts[id_]}
+
+    ctx.mounts = all_mounts
+
+    # Umount any pre-existing clients that we have not been asked to mount
+    for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()) - set(skipped.keys()):
+        mount = all_mounts[client_id]
+        if mount.is_mounted():
+            mount.umount_wait()
+
+    for remote in remotes:
+        FuseMount.cleanup_stale_netnses_and_bridge(remote)
+
+    # Mount any clients we have been asked to (default to mount all)
+    log.info('Mounting ceph-fuse clients...')
+    for info in mounted_by_me.values():
+        config = info["config"]
+        mount_x = info['mount']
+        if config.get("mount_path"):
+            mount_x.cephfs_mntpt = config.get("mount_path")
+        if config.get("mountpoint"):
+            mount_x.hostfs_mntpt = config.get("mountpoint")
+        mount_x.mount(createfs=False)
+
+    for info in mounted_by_me.values():
+        info["mount"].wait_until_mounted()
+
+    try:
+        yield all_mounts
+    finally:
+        log.info('Unmounting ceph-fuse clients...')
+
+        for info in mounted_by_me.values():
+            # Conditional because an inner context might have umounted it
+            mount = info["mount"]
+            if mount.is_mounted():
+                mount.umount_wait()
+        for remote in remotes:
+            FuseMount.cleanup_stale_netnses_and_bridge(remote)
diff --git a/qa/tasks/ceph_iscsi.py b/qa/tasks/ceph_iscsi.py
new file mode 100644
index 000000000..a8870743a
--- /dev/null
+++ b/qa/tasks/ceph_iscsi.py
@@ -0,0 +1,141 @@
+"""
+Run ceph-iscsi cluster setup
+"""
+import logging
+import contextlib
+from io import StringIO
+from teuthology.exceptions import CommandFailedError, ConnectionLostError
+from teuthology.orchestra import run
+from textwrap import dedent
+
+log = logging.getLogger(__name__)
+
+class IscsiSetup(object):
+    def __init__(self, ctx, config):
+        self.ctx = ctx
+        self.config = config
+        self.target_iqn = "iqn.2003-01.com.redhat.iscsi-gw:ceph-gw"
+        self.client_iqn = "iqn.1994-05.com.redhat:client"
+        self.trusted_ip_list = []
+        self.background_procs = []
+
+    def run_daemon(self, remote, cmds):
+        p = remote.run(args=['sudo', 'adjust-ulimits', 'daemon-helper', 'kill', cmds],
+                          wait=False, stdin=run.PIPE, stdout=StringIO())
+        self.background_procs.append(p)
+
+    def _kill_background(self, p):
+        if p.stdin:
+            p.stdin.close()
+            try:
+                p.wait()
+            except (CommandFailedError, ConnectionLostError):
+                pass
+
+    def kill_backgrounds(self):
+        for p in self.background_procs:
+            self._kill_background(p)
+        self.background_procs = []
+
+    def _setup_iscsi_gateway_cfg(self, role):
+        # setup the iscsi-gateway.cfg file, we only set the
+        # clust_name and trusted_ip_list and all the others
+        # as default
+        ips = ','.join(self.trusted_ip_list)
+        conf = dedent(f'''\
+[config]
+cluster_name = ceph
+pool = rbd
+api_secure = false
+api_port = 5000
+trusted_ip_list = {ips}
+''')
+        path = "/etc/ceph/iscsi-gateway.cfg"
+        (remote,) = (self.ctx.cluster.only(role).remotes.keys())
+        remote.sudo_write_file(path, conf)
+
+    def _setup_gateway(self, role):
+        """Spawned task that setups the gateway"""
+        (remote,) = (self.ctx.cluster.only(role).remotes.keys())
+
+        self._setup_iscsi_gateway_cfg(role)
+
+        self.run_daemon(remote, "/usr/bin/tcmu-runner")
+        self.run_daemon(remote, "/usr/bin/rbd-target-gw")
+        self.run_daemon(remote, "/usr/bin/rbd-target-api")
+
+    def setup_gateways(self):
+        for role in self.config['gateways']:
+            (remote,) = (self.ctx.cluster.only(role).remotes.keys())
+            self.trusted_ip_list.append(remote.ip_address)
+
+        for role in self.config['gateways']:
+            self._setup_gateway(role)
+
+    def _setup_client(self, role):
+        """Spawned task that setups the gateway"""
+        (remote,) = (self.ctx.cluster.only(role).remotes.keys())
+
+        # copy the "iscsi-gateway.cfg" to client and will be
+        # used to get the IPs
+        self._setup_iscsi_gateway_cfg(role)
+
+        conf = dedent(f'''
+InitiatorName={self.client_iqn}
+''')
+        path = "/etc/iscsi/initiatorname.iscsi"
+        remote.sudo_write_file(path, conf, mkdir=True)
+
+        # the restart is needed after the above change is applied
+        remote.run(args=['sudo', 'systemctl', 'restart', 'iscsid'])
+
+        remote.run(args=['sudo', 'modprobe', 'dm_multipath'])
+        remote.run(args=['sudo', 'mpathconf', '--enable'])
+        conf = dedent('''\
+devices {
+        device {
+                vendor                 "LIO-ORG"
+                product                "LIO-ORG"
+                hardware_handler       "1 alua"
+                path_grouping_policy   "failover"
+                path_selector          "queue-length 0"
+                failback               60
+                path_checker           tur
+                prio                   alua
+                prio_args              exclusive_pref_bit
+                fast_io_fail_tmo       25
+                no_path_retry          queue
+        }
+}
+''')
+        path = "/etc/multipath.conf"
+        remote.sudo_write_file(path, conf, append=True)
+        remote.run(args=['sudo', 'systemctl', 'start', 'multipathd'])
+
+    def setup_clients(self):
+        for role in self.config['clients']:
+            self._setup_client(role)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run ceph iscsi setup.
+
+    Specify the list of gateways to run ::
+
+      tasks:
+        ceph_iscsi:
+          gateways: [a_gateway.0, c_gateway.1]
+          clients: [b_client.0]
+
+    """
+    log.info('Setting ceph iscsi cluster...')
+    iscsi = IscsiSetup(ctx, config)
+    iscsi.setup_gateways()
+    iscsi.setup_clients()
+
+    try:
+        yield
+    finally:
+        log.info('Ending ceph iscsi daemons')
+        iscsi.kill_backgrounds()
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
new file mode 100644
index 000000000..bb1abfd51
--- /dev/null
+++ b/qa/tasks/ceph_manager.py
@@ -0,0 +1,3221 @@
+"""
+ceph manager -- Thrasher and CephManager objects
+"""
+from functools import wraps
+import contextlib
+import errno
+import random
+import signal
+import time
+import gevent
+import base64
+import json
+import logging
+import threading
+import traceback
+import os
+import shlex
+
+from io import BytesIO, StringIO
+from subprocess import DEVNULL
+from teuthology import misc as teuthology
+from tasks.scrub import Scrubber
+from tasks.util.rados import cmd_erasure_code_profile
+from tasks.util import get_remote
+from teuthology.contextutil import safe_while
+from teuthology.orchestra.remote import Remote
+from teuthology.orchestra import run
+from teuthology.exceptions import CommandFailedError
+from tasks.thrasher import Thrasher
+
+
+DEFAULT_CONF_PATH = '/etc/ceph/ceph.conf'
+
+log = logging.getLogger(__name__)
+
+# this is for cephadm clusters
+def shell(ctx, cluster_name, remote, args, name=None, **kwargs):
+    extra_args = []
+    if name:
+        extra_args = ['-n', name]
+    return remote.run(
+        args=[
+            'sudo',
+            ctx.cephadm,
+            '--image', ctx.ceph[cluster_name].image,
+            'shell',
+        ] + extra_args + [
+            '--fsid', ctx.ceph[cluster_name].fsid,
+            '--',
+        ] + args,
+        **kwargs
+    )
+
+# this is for rook clusters
+def toolbox(ctx, cluster_name, args, **kwargs):
+    return ctx.rook[cluster_name].remote.run(
+        args=[
+            'kubectl',
+            '-n', 'rook-ceph',
+            'exec',
+            ctx.rook[cluster_name].toolbox,
+            '--',
+        ] + args,
+        **kwargs
+    )
+
+
+def write_conf(ctx, conf_path=DEFAULT_CONF_PATH, cluster='ceph'):
+    conf_fp = BytesIO()
+    ctx.ceph[cluster].conf.write(conf_fp)
+    conf_fp.seek(0)
+    writes = ctx.cluster.run(
+        args=[
+            'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'),
+            'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'),
+            'sudo', 'tee', conf_path, run.Raw('&&'),
+            'sudo', 'chmod', '0644', conf_path,
+            run.Raw('>'), '/dev/null',
+
+        ],
+        stdin=run.PIPE,
+        wait=False)
+    teuthology.feed_many_stdins_and_close(conf_fp, writes)
+    run.wait(writes)
+
+def get_valgrind_args(testdir, name, preamble, v, exit_on_first_error=True, cd=True):
+    """
+    Build a command line for running valgrind.
+
+    testdir - test results directory
+    name - name of daemon (for naming hte log file)
+    preamble - stuff we should run before valgrind
+    v - valgrind arguments
+    """
+    if v is None:
+        return preamble
+    if not isinstance(v, list):
+        v = [v]
+
+    # https://tracker.ceph.com/issues/44362
+    preamble.extend([
+        'env', 'OPENSSL_ia32cap=~0x1000000000000000',
+    ])
+
+    val_path = '/var/log/ceph/valgrind'
+    if '--tool=memcheck' in v or '--tool=helgrind' in v:
+        extra_args = [
+            'valgrind',
+            '--trace-children=no',
+            '--child-silent-after-fork=yes',
+            '--soname-synonyms=somalloc=*tcmalloc*',
+            '--num-callers=50',
+            '--suppressions={tdir}/valgrind.supp'.format(tdir=testdir),
+            '--xml=yes',
+            '--xml-file={vdir}/{n}.log'.format(vdir=val_path, n=name),
+            '--time-stamp=yes',
+            '--vgdb=yes',
+        ]
+    else:
+        extra_args = [
+            'valgrind',
+            '--trace-children=no',
+            '--child-silent-after-fork=yes',
+            '--soname-synonyms=somalloc=*tcmalloc*',
+            '--suppressions={tdir}/valgrind.supp'.format(tdir=testdir),
+            '--log-file={vdir}/{n}.log'.format(vdir=val_path, n=name),
+            '--time-stamp=yes',
+            '--vgdb=yes',
+        ]
+    if exit_on_first_error:
+        extra_args.extend([
+            # at least Valgrind 3.14 is required
+            '--exit-on-first-error=yes',
+            '--error-exitcode=42',
+        ])
+    args = []
+    if cd:
+        args += ['cd', testdir, run.Raw('&&')]
+    args += preamble + extra_args + v
+    log.debug('running %s under valgrind with args %s', name, args)
+    return args
+
+
+def mount_osd_data(ctx, remote, cluster, osd):
+    """
+    Mount a remote OSD
+
+    :param ctx: Context
+    :param remote: Remote site
+    :param cluster: name of ceph cluster
+    :param osd: Osd name
+    """
+    log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote))
+    role = "{0}.osd.{1}".format(cluster, osd)
+    alt_role = role if cluster != 'ceph' else "osd.{0}".format(osd)
+    if remote in ctx.disk_config.remote_to_roles_to_dev:
+        if alt_role in ctx.disk_config.remote_to_roles_to_dev[remote]:
+            role = alt_role
+        if role not in ctx.disk_config.remote_to_roles_to_dev[remote]:
+            return
+        dev = ctx.disk_config.remote_to_roles_to_dev[remote][role]
+        mount_options = ctx.disk_config.\
+            remote_to_roles_to_dev_mount_options[remote][role]
+        fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role]
+        mnt = os.path.join('/var/lib/ceph/osd', '{0}-{1}'.format(cluster, osd))
+
+        log.info('Mounting osd.{o}: dev: {n}, cluster: {c}'
+                 'mountpoint: {p}, type: {t}, options: {v}'.format(
+                     o=osd, n=remote.name, p=mnt, t=fstype, v=mount_options,
+                     c=cluster))
+
+        remote.run(
+            args=[
+                'sudo',
+                'mount',
+                '-t', fstype,
+                '-o', ','.join(mount_options),
+                dev,
+                mnt,
+            ]
+            )
+
+
+def log_exc(func):
+    @wraps(func)
+    def wrapper(self):
+        try:
+            return func(self)
+        except:
+            self.log(traceback.format_exc())
+            raise
+    return wrapper
+
+
+class PoolType:
+    REPLICATED = 1
+    ERASURE_CODED = 3
+
+
+class OSDThrasher(Thrasher):
+    """
+    Object used to thrash Ceph
+    """
+    def __init__(self, manager, config, name, logger):
+        super(OSDThrasher, self).__init__()
+
+        self.ceph_manager = manager
+        self.cluster = manager.cluster
+        self.ceph_manager.wait_for_clean()
+        osd_status = self.ceph_manager.get_osd_status()
+        self.in_osds = osd_status['in']
+        self.live_osds = osd_status['live']
+        self.out_osds = osd_status['out']
+        self.dead_osds = osd_status['dead']
+        self.stopping = False
+        self.logger = logger
+        self.config = config
+        self.name = name
+        self.revive_timeout = self.config.get("revive_timeout", 360)
+        self.pools_to_fix_pgp_num = set()
+        if self.config.get('powercycle'):
+            self.revive_timeout += 120
+        self.clean_wait = self.config.get('clean_wait', 0)
+        self.minin = self.config.get("min_in", 4)
+        self.chance_move_pg = self.config.get('chance_move_pg', 1.0)
+        self.sighup_delay = self.config.get('sighup_delay')
+        self.optrack_toggle_delay = self.config.get('optrack_toggle_delay')
+        self.dump_ops_enable = self.config.get('dump_ops_enable')
+        self.noscrub_toggle_delay = self.config.get('noscrub_toggle_delay')
+        self.chance_thrash_cluster_full = self.config.get('chance_thrash_cluster_full', .05)
+        self.chance_thrash_pg_upmap = self.config.get('chance_thrash_pg_upmap', 1.0)
+        self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0)
+        self.random_eio = self.config.get('random_eio')
+        self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
+
+        num_osds = self.in_osds + self.out_osds
+        self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * len(num_osds)
+        self.min_pgs = self.config.get("min_pgs_per_pool_osd", 1) * len(num_osds)
+        if self.config is None:
+            self.config = dict()
+        # prevent monitor from auto-marking things out while thrasher runs
+        # try both old and new tell syntax, in case we are testing old code
+        self.saved_options = []
+        # assuming that the default settings do not vary from one daemon to
+        # another
+        first_mon = teuthology.get_first_mon(manager.ctx, self.config).split('.')
+        opts = [('mon', 'mon_osd_down_out_interval', 0)]
+        #why do we disable marking an OSD out automatically? :/
+        for service, opt, new_value in opts:
+            old_value = manager.get_config(first_mon[0],
+                                           first_mon[1],
+                                           opt)
+            self.saved_options.append((service, opt, old_value))
+            manager.inject_args(service, '*', opt, new_value)
+        # initialize ceph_objectstore_tool property - must be done before
+        # do_thrash is spawned - http://tracker.ceph.com/issues/18799
+        if (self.config.get('powercycle') or
+            not self.cmd_exists_on_osds("ceph-objectstore-tool") or
+            self.config.get('disable_objectstore_tool_tests', False)):
+            self.ceph_objectstore_tool = False
+            if self.config.get('powercycle'):
+                self.log("Unable to test ceph-objectstore-tool, "
+                         "powercycle testing")
+            else:
+                self.log("Unable to test ceph-objectstore-tool, "
+                         "not available on all OSD nodes")
+        else:
+            self.ceph_objectstore_tool = \
+                self.config.get('ceph_objectstore_tool', True)
+        # spawn do_thrash
+        self.thread = gevent.spawn(self.do_thrash)
+        if self.sighup_delay:
+            self.sighup_thread = gevent.spawn(self.do_sighup)
+        if self.optrack_toggle_delay:
+            self.optrack_toggle_thread = gevent.spawn(self.do_optrack_toggle)
+        if self.dump_ops_enable == "true":
+            self.dump_ops_thread = gevent.spawn(self.do_dump_ops)
+        if self.noscrub_toggle_delay:
+            self.noscrub_toggle_thread = gevent.spawn(self.do_noscrub_toggle)
+
+    def log(self, msg, *args, **kwargs):
+        self.logger.info(msg, *args, **kwargs)
+
+    def cmd_exists_on_osds(self, cmd):
+        if self.ceph_manager.cephadm or self.ceph_manager.rook:
+            return True
+        allremotes = self.ceph_manager.ctx.cluster.only(\
+            teuthology.is_type('osd', self.cluster)).remotes.keys()
+        allremotes = list(set(allremotes))
+        for remote in allremotes:
+            proc = remote.run(args=['type', cmd], wait=True,
+                              check_status=False, stdout=BytesIO(),
+                              stderr=BytesIO())
+            if proc.exitstatus != 0:
+                return False;
+        return True;
+
+    def run_ceph_objectstore_tool(self, remote, osd, cmd):
+        if self.ceph_manager.cephadm:
+            return shell(
+                self.ceph_manager.ctx, self.ceph_manager.cluster, remote,
+                args=['ceph-objectstore-tool'] + cmd,
+                name=osd,
+                wait=True, check_status=False,
+                stdout=StringIO(),
+                stderr=StringIO())
+        elif self.ceph_manager.rook:
+            assert False, 'not implemented'
+        else:
+            return remote.run(
+                args=['sudo', 'adjust-ulimits', 'ceph-objectstore-tool'] + cmd,
+                wait=True, check_status=False,
+                stdout=StringIO(),
+                stderr=StringIO())
+
+    def run_ceph_bluestore_tool(self, remote, osd, cmd):
+        if self.ceph_manager.cephadm:
+            return shell(
+                self.ceph_manager.ctx, self.ceph_manager.cluster, remote,
+                args=['ceph-bluestore-tool', '--err-to-stderr'] + cmd,
+                name=osd,
+                wait=True, check_status=False,
+                stdout=StringIO(),
+                stderr=StringIO())
+        elif self.ceph_manager.rook:
+            assert False, 'not implemented'
+        else:
+            return remote.run(
+                args=['sudo', 'ceph-bluestore-tool', '--err-to-stderr'] + cmd,
+                wait=True, check_status=False,
+                stdout=StringIO(),
+                stderr=StringIO())
+
+    def kill_osd(self, osd=None, mark_down=False, mark_out=False):
+        """
+        :param osd: Osd to be killed.
+        :mark_down: Mark down if true.
+        :mark_out: Mark out if true.
+        """
+        if osd is None:
+            osd = random.choice(self.live_osds)
+        self.log("Killing osd %s, live_osds are %s" % (str(osd),
+                                                       str(self.live_osds)))
+        self.live_osds.remove(osd)
+        self.dead_osds.append(osd)
+        self.ceph_manager.kill_osd(osd)
+        if mark_down:
+            self.ceph_manager.mark_down_osd(osd)
+        if mark_out and osd in self.in_osds:
+            self.out_osd(osd)
+        if self.ceph_objectstore_tool:
+            self.log("Testing ceph-objectstore-tool on down osd.%s" % osd)
+            remote = self.ceph_manager.find_remote('osd', osd)
+            FSPATH = self.ceph_manager.get_filepath()
+            JPATH = os.path.join(FSPATH, "journal")
+            exp_osd = imp_osd = osd
+            self.log('remote for osd %s is %s' % (osd, remote))
+            exp_remote = imp_remote = remote
+            # If an older osd is available we'll move a pg from there
+            if (len(self.dead_osds) > 1 and
+                    random.random() < self.chance_move_pg):
+                exp_osd = random.choice(self.dead_osds[:-1])
+                exp_remote = self.ceph_manager.find_remote('osd', exp_osd)
+                self.log('remote for exp osd %s is %s' % (exp_osd, exp_remote))
+            prefix = [
+                '--no-mon-config',
+                '--log-file=/var/log/ceph/objectstore_tool.$pid.log',
+            ]
+
+            if self.ceph_manager.rook:
+                assert False, 'not implemented'
+
+            if not self.ceph_manager.cephadm:
+                # ceph-objectstore-tool might be temporarily absent during an
+                # upgrade - see http://tracker.ceph.com/issues/18014
+                with safe_while(sleep=15, tries=40, action="type ceph-objectstore-tool") as proceed:
+                    while proceed():
+                        proc = exp_remote.run(args=['type', 'ceph-objectstore-tool'],
+                                              wait=True, check_status=False, stdout=BytesIO(),
+                                              stderr=BytesIO())
+                        if proc.exitstatus == 0:
+                            break
+                        log.debug("ceph-objectstore-tool binary not present, trying again")
+
+            # ceph-objectstore-tool might bogusly fail with "OSD has the store locked"
+            # see http://tracker.ceph.com/issues/19556
+            with safe_while(sleep=15, tries=40, action="ceph-objectstore-tool --op list-pgs") as proceed:
+                while proceed():
+                    proc = self.run_ceph_objectstore_tool(
+                        exp_remote, 'osd.%s' % exp_osd,
+                        prefix + [
+                            '--data-path', FSPATH.format(id=exp_osd),
+                            '--journal-path', JPATH.format(id=exp_osd),
+                            '--op', 'list-pgs',
+                        ])
+                    if proc.exitstatus == 0:
+                        break
+                    elif (proc.exitstatus == 1 and
+                          proc.stderr.getvalue() == "OSD has the store locked"):
+                        continue
+                    else:
+                        raise Exception("ceph-objectstore-tool: "
+                                        "exp list-pgs failure with status {ret}".
+                                        format(ret=proc.exitstatus))
+
+            pgs = proc.stdout.getvalue().split('\n')[:-1]
+            if len(pgs) == 0:
+                self.log("No PGs found for osd.{osd}".format(osd=exp_osd))
+                return
+            pg = random.choice(pgs)
+            #exp_path = teuthology.get_testdir(self.ceph_manager.ctx)
+            #exp_path = os.path.join(exp_path, '{0}.data'.format(self.cluster))
+            exp_path = os.path.join('/var/log/ceph', # available inside 'shell' container
+                                    "exp.{pg}.{id}".format(
+                                        pg=pg,
+                                        id=exp_osd))
+            if self.ceph_manager.cephadm:
+                exp_host_path = os.path.join(
+                    '/var/log/ceph',
+                    self.ceph_manager.ctx.ceph[self.ceph_manager.cluster].fsid,
+                    "exp.{pg}.{id}".format(
+                        pg=pg,
+                        id=exp_osd))
+            else:
+                exp_host_path = exp_path
+
+            # export
+            # Can't use new export-remove op since this is part of upgrade testing
+            proc = self.run_ceph_objectstore_tool(
+                exp_remote, 'osd.%s' % exp_osd,
+                prefix + [
+                    '--data-path', FSPATH.format(id=exp_osd),
+                    '--journal-path', JPATH.format(id=exp_osd),
+                    '--op', 'export',
+                    '--pgid', pg,
+                    '--file', exp_path,
+                ])
+            if proc.exitstatus:
+                raise Exception("ceph-objectstore-tool: "
+                                "export failure with status {ret}".
+                                format(ret=proc.exitstatus))
+            # remove
+            proc = self.run_ceph_objectstore_tool(
+                exp_remote, 'osd.%s' % exp_osd,
+                prefix + [
+                    '--data-path', FSPATH.format(id=exp_osd),
+                    '--journal-path', JPATH.format(id=exp_osd),
+                    '--force',
+                    '--op', 'remove',
+                    '--pgid', pg,
+                ])
+            if proc.exitstatus:
+                raise Exception("ceph-objectstore-tool: "
+                                "remove failure with status {ret}".
+                                format(ret=proc.exitstatus))
+            # If there are at least 2 dead osds we might move the pg
+            if exp_osd != imp_osd:
+                # If pg isn't already on this osd, then we will move it there
+                proc = self.run_ceph_objectstore_tool(
+                    imp_remote,
+                    'osd.%s' % imp_osd,
+                    prefix + [
+                        '--data-path', FSPATH.format(id=imp_osd),
+                        '--journal-path', JPATH.format(id=imp_osd),
+                        '--op', 'list-pgs',
+                    ])
+                if proc.exitstatus:
+                    raise Exception("ceph-objectstore-tool: "
+                                    "imp list-pgs failure with status {ret}".
+                                    format(ret=proc.exitstatus))
+                pgs = proc.stdout.getvalue().split('\n')[:-1]
+                if pg not in pgs:
+                    self.log("Moving pg {pg} from osd.{fosd} to osd.{tosd}".
+                             format(pg=pg, fosd=exp_osd, tosd=imp_osd))
+                    if imp_remote != exp_remote:
+                        # Copy export file to the other machine
+                        self.log("Transfer export file from {srem} to {trem}".
+                                 format(srem=exp_remote, trem=imp_remote))
+                        # just in case an upgrade make /var/log/ceph unreadable by non-root,
+                        exp_remote.run(args=['sudo', 'chmod', '777',
+                                             '/var/log/ceph'])
+                        imp_remote.run(args=['sudo', 'chmod', '777',
+                                             '/var/log/ceph'])
+                        tmpexport = Remote.get_file(exp_remote, exp_host_path,
+                                                    sudo=True)
+                        if exp_host_path != exp_path:
+                            # push to /var/log/ceph, then rename (we can't
+                            # chmod 777 the /var/log/ceph/$fsid mountpoint)
+                            Remote.put_file(imp_remote, tmpexport, exp_path)
+                            imp_remote.run(args=[
+                                'sudo', 'mv', exp_path, exp_host_path])
+                        else:
+                            Remote.put_file(imp_remote, tmpexport, exp_host_path)
+                        os.remove(tmpexport)
+                else:
+                    # Can't move the pg after all
+                    imp_osd = exp_osd
+                    imp_remote = exp_remote
+            # import
+            proc = self.run_ceph_objectstore_tool(
+                imp_remote, 'osd.%s' % imp_osd,
+                [
+                    '--data-path', FSPATH.format(id=imp_osd),
+                    '--journal-path', JPATH.format(id=imp_osd),
+                    '--log-file=/var/log/ceph/objectstore_tool.$pid.log',
+                    '--op', 'import',
+                    '--file', exp_path,
+                ])
+            if proc.exitstatus == 1:
+                bogosity = "The OSD you are using is older than the exported PG"
+                if bogosity in proc.stderr.getvalue():
+                    self.log("OSD older than exported PG"
+                             "...ignored")
+            elif proc.exitstatus == 10:
+                self.log("Pool went away before processing an import"
+                         "...ignored")
+            elif proc.exitstatus == 11:
+                self.log("Attempt to import an incompatible export"
+                         "...ignored")
+            elif proc.exitstatus == 12:
+                # this should be safe to ignore because we only ever move 1
+                # copy of the pg at a time, and merge is only initiated when
+                # all replicas are peered and happy.  /me crosses fingers
+                self.log("PG merged on target"
+                         "...ignored")
+            elif proc.exitstatus:
+                raise Exception("ceph-objectstore-tool: "
+                                "import failure with status {ret}".
+                                format(ret=proc.exitstatus))
+            cmd = "sudo rm -f {file}".format(file=exp_host_path)
+            exp_remote.run(args=cmd)
+            if imp_remote != exp_remote:
+                imp_remote.run(args=cmd)
+
+            # apply low split settings to each pool
+            if not self.ceph_manager.cephadm:
+                for pool in self.ceph_manager.list_pools():
+                    cmd = ("CEPH_ARGS='--filestore-merge-threshold 1 "
+                           "--filestore-split-multiple 1' sudo -E "
+                           + 'ceph-objectstore-tool '
+                           + ' '.join(prefix + [
+                               '--data-path', FSPATH.format(id=imp_osd),
+                               '--journal-path', JPATH.format(id=imp_osd),
+                           ])
+                           + " --op apply-layout-settings --pool " + pool).format(id=osd)
+                    proc = imp_remote.run(args=cmd,
+                                          wait=True, check_status=False,
+                                          stderr=StringIO())
+                    if 'Couldn\'t find pool' in proc.stderr.getvalue():
+                        continue
+                    if proc.exitstatus:
+                        raise Exception("ceph-objectstore-tool apply-layout-settings"
+                                        " failed with {status}".format(status=proc.exitstatus))
+
+
+    def blackhole_kill_osd(self, osd=None):
+        """
+        If all else fails, kill the osd.
+        :param osd: Osd to be killed.
+        """
+        if osd is None:
+            osd = random.choice(self.live_osds)
+        self.log("Blackholing and then killing osd %s, live_osds are %s" %
+                 (str(osd), str(self.live_osds)))
+        self.live_osds.remove(osd)
+        self.dead_osds.append(osd)
+        self.ceph_manager.blackhole_kill_osd(osd)
+
+    def revive_osd(self, osd=None, skip_admin_check=False):
+        """
+        Revive the osd.
+        :param osd: Osd to be revived.
+        """
+        if osd is None:
+            osd = random.choice(self.dead_osds)
+        self.log("Reviving osd %s" % (str(osd),))
+        self.ceph_manager.revive_osd(
+            osd,
+            self.revive_timeout,
+            skip_admin_check=skip_admin_check)
+        self.dead_osds.remove(osd)
+        self.live_osds.append(osd)
+        if self.random_eio > 0 and osd == self.rerrosd:
+            self.ceph_manager.set_config(self.rerrosd,
+                                         filestore_debug_random_read_err = self.random_eio)
+            self.ceph_manager.set_config(self.rerrosd,
+                                         bluestore_debug_random_read_err = self.random_eio)
+
+
+    def out_osd(self, osd=None):
+        """
+        Mark the osd out
+        :param osd: Osd to be marked.
+        """
+        if osd is None:
+            osd = random.choice(self.in_osds)
+        self.log("Removing osd %s, in_osds are: %s" %
+                 (str(osd), str(self.in_osds)))
+        self.ceph_manager.mark_out_osd(osd)
+        self.in_osds.remove(osd)
+        self.out_osds.append(osd)
+
+    def in_osd(self, osd=None):
+        """
+        Mark the osd out
+        :param osd: Osd to be marked.
+        """
+        if osd is None:
+            osd = random.choice(self.out_osds)
+        if osd in self.dead_osds:
+            return self.revive_osd(osd)
+        self.log("Adding osd %s" % (str(osd),))
+        self.out_osds.remove(osd)
+        self.in_osds.append(osd)
+        self.ceph_manager.mark_in_osd(osd)
+        self.log("Added osd %s" % (str(osd),))
+
+    def reweight_osd_or_by_util(self, osd=None):
+        """
+        Reweight an osd that is in
+        :param osd: Osd to be marked.
+        """
+        if osd is not None or random.choice([True, False]):
+            if osd is None:
+                osd = random.choice(self.in_osds)
+            val = random.uniform(.1, 1.0)
+            self.log("Reweighting osd %s to %s" % (str(osd), str(val)))
+            self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
+                                              str(osd), str(val))
+        else:
+            # do it several times, the option space is large
+            for i in range(5):
+                options = {
+                    'max_change': random.choice(['0.05', '1.0', '3.0']),
+                    'overage': random.choice(['110', '1000']),
+                    'type': random.choice([
+                        'reweight-by-utilization',
+                        'test-reweight-by-utilization']),
+                }
+                self.log("Reweighting by: %s"%(str(options),))
+                self.ceph_manager.raw_cluster_cmd(
+                    'osd',
+                    options['type'],
+                    options['overage'],
+                    options['max_change'])
+
+    def primary_affinity(self, osd=None):
+        self.log("primary_affinity")
+        if osd is None:
+            osd = random.choice(self.in_osds)
+        if random.random() >= .5:
+            pa = random.random()
+        elif random.random() >= .5:
+            pa = 1
+        else:
+            pa = 0
+        self.log('Setting osd %s primary_affinity to %f' % (str(osd), pa))
+        self.ceph_manager.raw_cluster_cmd('osd', 'primary-affinity',
+                                          str(osd), str(pa))
+
+    def thrash_cluster_full(self):
+        """
+        Set and unset cluster full condition
+        """
+        self.log('Setting full ratio to .001')
+        self.ceph_manager.raw_cluster_cmd('osd', 'set-full-ratio', '.001')
+        time.sleep(1)
+        self.log('Setting full ratio back to .95')
+        self.ceph_manager.raw_cluster_cmd('osd', 'set-full-ratio', '.95')
+
+    def thrash_pg_upmap(self):
+        """
+        Install or remove random pg_upmap entries in OSDMap
+        """
+        self.log("thrash_pg_upmap")
+        from random import shuffle
+        out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
+        j = json.loads(out)
+        self.log('j is %s' % j)
+        try:
+            if random.random() >= .3:
+                pgs = self.ceph_manager.get_pg_stats()
+                if not pgs:
+                    self.log('No pgs; doing nothing')
+                    return
+                pg = random.choice(pgs)
+                pgid = str(pg['pgid'])
+                poolid = int(pgid.split('.')[0])
+                sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
+                if len(sizes) == 0:
+                    self.log('No pools; doing nothing')
+                    return
+                n = sizes[0]
+                osds = self.in_osds + self.out_osds
+                shuffle(osds)
+                osds = osds[0:n]
+                self.log('Setting %s to %s' % (pgid, osds))
+                cmd = ['osd', 'pg-upmap', pgid] + [str(x) for x in osds]
+                self.log('cmd %s' % cmd)
+                self.ceph_manager.raw_cluster_cmd(*cmd)
+            else:
+                m = j['pg_upmap']
+                if len(m) > 0:
+                    shuffle(m)
+                    pg = m[0]['pgid']
+                    self.log('Clearing pg_upmap on %s' % pg)
+                    self.ceph_manager.raw_cluster_cmd(
+                        'osd',
+                        'rm-pg-upmap',
+                        pg)
+                else:
+                    self.log('No pg_upmap entries; doing nothing')
+        except CommandFailedError:
+            self.log('Failed to rm-pg-upmap, ignoring')
+
+    def thrash_pg_upmap_items(self):
+        """
+        Install or remove random pg_upmap_items entries in OSDMap
+        """
+        self.log("thrash_pg_upmap_items")
+        from random import shuffle
+        out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
+        j = json.loads(out)
+        self.log('j is %s' % j)
+        try:
+            if random.random() >= .3:
+                pgs = self.ceph_manager.get_pg_stats()
+                if not pgs:
+                    self.log('No pgs; doing nothing')
+                    return
+                pg = random.choice(pgs)
+                pgid = str(pg['pgid'])
+                poolid = int(pgid.split('.')[0])
+                sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
+                if len(sizes) == 0:
+                    self.log('No pools; doing nothing')
+                    return
+                n = sizes[0]
+                osds = self.in_osds + self.out_osds
+                shuffle(osds)
+                osds = osds[0:n*2]
+                self.log('Setting %s to %s' % (pgid, osds))
+                cmd = ['osd', 'pg-upmap-items', pgid] + [str(x) for x in osds]
+                self.log('cmd %s' % cmd)
+                self.ceph_manager.raw_cluster_cmd(*cmd)
+            else:
+                m = j['pg_upmap_items']
+                if len(m) > 0:
+                    shuffle(m)
+                    pg = m[0]['pgid']
+                    self.log('Clearing pg_upmap on %s' % pg)
+                    self.ceph_manager.raw_cluster_cmd(
+                        'osd',
+                        'rm-pg-upmap-items',
+                        pg)
+                else:
+                    self.log('No pg_upmap entries; doing nothing')
+        except CommandFailedError:
+            self.log('Failed to rm-pg-upmap-items, ignoring')
+
+    def force_recovery(self):
+        """
+        Force recovery on some of PGs
+        """
+        backfill = random.random() >= 0.5
+        j = self.ceph_manager.get_pgids_to_force(backfill)
+        if j:
+            try:
+                if backfill:
+                    self.ceph_manager.raw_cluster_cmd('pg', 'force-backfill', *j)
+                else:
+                    self.ceph_manager.raw_cluster_cmd('pg', 'force-recovery', *j)
+            except CommandFailedError:
+                self.log('Failed to force backfill|recovery, ignoring')
+
+
+    def cancel_force_recovery(self):
+        """
+        Force recovery on some of PGs
+        """
+        backfill = random.random() >= 0.5
+        j = self.ceph_manager.get_pgids_to_cancel_force(backfill)
+        if j:
+            try:
+                if backfill:
+                    self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-backfill', *j)
+                else:
+                    self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-recovery', *j)
+            except CommandFailedError:
+                self.log('Failed to force backfill|recovery, ignoring')
+
+    def force_cancel_recovery(self):
+        """
+        Force or cancel forcing recovery
+        """
+        if random.random() >= 0.4:
+           self.force_recovery()
+        else:
+           self.cancel_force_recovery()
+
+    def all_up(self):
+        """
+        Make sure all osds are up and not out.
+        """
+        while len(self.dead_osds) > 0:
+            self.log("reviving osd")
+            self.revive_osd()
+        while len(self.out_osds) > 0:
+            self.log("inning osd")
+            self.in_osd()
+
+    def all_up_in(self):
+        """
+        Make sure all osds are up and fully in.
+        """
+        self.all_up();
+        for osd in self.live_osds:
+            self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
+                                              str(osd), str(1))
+            self.ceph_manager.raw_cluster_cmd('osd', 'primary-affinity',
+                                              str(osd), str(1))
+
+    def do_join(self):
+        """
+        Break out of this Ceph loop
+        """
+        self.stopping = True
+        self.thread.get()
+        if self.sighup_delay:
+            self.log("joining the do_sighup greenlet")
+            self.sighup_thread.get()
+        if self.optrack_toggle_delay:
+            self.log("joining the do_optrack_toggle greenlet")
+            self.optrack_toggle_thread.join()
+        if self.dump_ops_enable == "true":
+            self.log("joining the do_dump_ops greenlet")
+            self.dump_ops_thread.join()
+        if self.noscrub_toggle_delay:
+            self.log("joining the do_noscrub_toggle greenlet")
+            self.noscrub_toggle_thread.join()
+
+    def grow_pool(self):
+        """
+        Increase the size of the pool
+        """
+        pool = self.ceph_manager.get_pool()
+        if pool is None:
+            return
+        self.log("Growing pool %s" % (pool,))
+        if self.ceph_manager.expand_pool(pool,
+                                         self.config.get('pool_grow_by', 10),
+                                         self.max_pgs):
+            self.pools_to_fix_pgp_num.add(pool)
+
+    def shrink_pool(self):
+        """
+        Decrease the size of the pool
+        """
+        pool = self.ceph_manager.get_pool()
+        if pool is None:
+            return
+        _ = self.ceph_manager.get_pool_pg_num(pool)
+        self.log("Shrinking pool %s" % (pool,))
+        if self.ceph_manager.contract_pool(
+                pool,
+                self.config.get('pool_shrink_by', 10),
+                self.min_pgs):
+            self.pools_to_fix_pgp_num.add(pool)
+
+    def fix_pgp_num(self, pool=None):
+        """
+        Fix number of pgs in pool.
+        """
+        if pool is None:
+            pool = self.ceph_manager.get_pool()
+            if not pool:
+                return
+            force = False
+        else:
+            force = True
+        self.log("fixing pg num pool %s" % (pool,))
+        if self.ceph_manager.set_pool_pgpnum(pool, force):
+            self.pools_to_fix_pgp_num.discard(pool)
+
+    def test_pool_min_size(self):
+        """
+        Loop to selectively push PGs below their min_size and test that recovery
+        still occurs.
+        """
+        self.log("test_pool_min_size")
+        self.all_up()
+        time.sleep(60) # buffer time for recovery to start.
+        self.ceph_manager.wait_for_recovery(
+            timeout=self.config.get('timeout')
+            )
+        minout = int(self.config.get("min_out", 1))
+        minlive = int(self.config.get("min_live", 2))
+        mindead = int(self.config.get("min_dead", 1))
+        self.log("doing min_size thrashing")
+        self.ceph_manager.wait_for_clean(timeout=180)
+        assert self.ceph_manager.is_clean(), \
+            'not clean before minsize thrashing starts'
+        while not self.stopping:
+            # look up k and m from all the pools on each loop, in case it
+            # changes as the cluster runs
+            k = 0
+            m = 99
+            has_pools = False
+            pools_json = self.ceph_manager.get_osd_dump_json()['pools']
+
+            for pool_json in pools_json:
+                pool = pool_json['pool_name']
+                has_pools = True
+                pool_type = pool_json['type']  # 1 for rep, 3 for ec
+                min_size = pool_json['min_size']
+                self.log("pool {pool} min_size is {min_size}".format(pool=pool,min_size=min_size))
+                try:
+                    ec_profile = self.ceph_manager.get_pool_property(pool, 'erasure_code_profile')
+                    if pool_type != PoolType.ERASURE_CODED:
+                        continue
+                    ec_profile = pool_json['erasure_code_profile']
+                    ec_profile_json = self.ceph_manager.raw_cluster_cmd(
+                        'osd',
+                        'erasure-code-profile',
+                        'get',
+                        ec_profile,
+                        '--format=json')
+                    ec_json = json.loads(ec_profile_json)
+                    local_k = int(ec_json['k'])
+                    local_m = int(ec_json['m'])
+                    self.log("pool {pool} local_k={k} local_m={m}".format(pool=pool,
+                                                                          k=local_k, m=local_m))
+                    if local_k > k:
+                        self.log("setting k={local_k} from previous {k}".format(local_k=local_k, k=k))
+                        k = local_k
+                    if local_m < m:
+                        self.log("setting m={local_m} from previous {m}".format(local_m=local_m, m=m))
+                        m = local_m
+                except CommandFailedError:
+                    self.log("failed to read erasure_code_profile. %s was likely removed", pool)
+                    continue
+
+            if has_pools :
+                self.log("using k={k}, m={m}".format(k=k,m=m))
+            else:
+                self.log("No pools yet, waiting")
+                time.sleep(5)
+                continue
+                
+            if minout > len(self.out_osds): # kill OSDs and mark out
+                self.log("forced to out an osd")
+                self.kill_osd(mark_out=True)
+                continue
+            elif mindead > len(self.dead_osds): # kill OSDs but force timeout
+                self.log("forced to kill an osd")
+                self.kill_osd()
+                continue
+            else: # make mostly-random choice to kill or revive OSDs
+                minup = max(minlive, k)
+                rand_val = random.uniform(0, 1)
+                self.log("choosing based on number of live OSDs and rand val {rand}".\
+                         format(rand=rand_val))
+                if len(self.live_osds) > minup+1 and rand_val < 0.5:
+                    # chose to knock out as many OSDs as we can w/out downing PGs
+                    
+                    most_killable = min(len(self.live_osds) - minup, m)
+                    self.log("chose to kill {n} OSDs".format(n=most_killable))
+                    for i in range(1, most_killable):
+                        self.kill_osd(mark_out=True)
+                    time.sleep(10)
+                    # try a few times since there might be a concurrent pool
+                    # creation or deletion
+                    with safe_while(
+                            sleep=25, tries=5,
+                            action='check for active or peered') as proceed:
+                        while proceed():
+                            if self.ceph_manager.all_active_or_peered():
+                                break
+                            self.log('not all PGs are active or peered')
+                else: # chose to revive OSDs, bring up a random fraction of the dead ones
+                    self.log("chose to revive osds")
+                    for i in range(1, int(rand_val * len(self.dead_osds))):
+                        self.revive_osd(i)
+
+            # let PGs repair themselves or our next knockout might kill one
+            self.ceph_manager.wait_for_clean(timeout=self.config.get('timeout'))
+ 
+        # / while not self.stopping
+        self.all_up_in()
+ 
+        self.ceph_manager.wait_for_recovery(
+            timeout=self.config.get('timeout')
+            )
+
+    def inject_pause(self, conf_key, duration, check_after, should_be_down):
+        """
+        Pause injection testing. Check for osd being down when finished.
+        """
+        the_one = random.choice(self.live_osds)
+        self.log("inject_pause on osd.{osd}".format(osd=the_one))
+        self.log(
+            "Testing {key} pause injection for duration {duration}".format(
+                key=conf_key,
+                duration=duration
+                ))
+        self.log(
+            "Checking after {after}, should_be_down={shouldbedown}".format(
+                after=check_after,
+                shouldbedown=should_be_down
+                ))
+        self.ceph_manager.set_config(the_one, **{conf_key: duration})
+        if not should_be_down:
+            return
+        time.sleep(check_after)
+        status = self.ceph_manager.get_osd_status()
+        assert the_one in status['down']
+        time.sleep(duration - check_after + 20)
+        status = self.ceph_manager.get_osd_status()
+        assert not the_one in status['down']
+
+    def test_backfill_full(self):
+        """
+        Test backfills stopping when the replica fills up.
+
+        First, use injectfull admin command to simulate a now full
+        osd by setting it to 0 on all of the OSDs.
+
+        Second, on a random subset, set
+        osd_debug_skip_full_check_in_backfill_reservation to force
+        the more complicated check in do_scan to be exercised.
+
+        Then, verify that all backfillings stop.
+        """
+        self.log("injecting backfill full")
+        for i in self.live_osds:
+            self.ceph_manager.set_config(
+                i,
+                osd_debug_skip_full_check_in_backfill_reservation=
+                random.choice(['false', 'true']))
+            self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'backfillfull'],
+                                     check_status=True, timeout=30, stdout=DEVNULL)
+        for i in range(30):
+            status = self.ceph_manager.compile_pg_status()
+            if 'backfilling' not in status.keys():
+                break
+            self.log(
+                "waiting for {still_going} backfillings".format(
+                    still_going=status.get('backfilling')))
+            time.sleep(1)
+        assert('backfilling' not in self.ceph_manager.compile_pg_status().keys())
+        for i in self.live_osds:
+            self.ceph_manager.set_config(
+                i,
+                osd_debug_skip_full_check_in_backfill_reservation='false')
+            self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'none'],
+                                     check_status=True, timeout=30, stdout=DEVNULL)
+
+
+    def generate_random_sharding(self):
+        prefixes = [
+            'm','O','P','L'
+        ]
+        new_sharding = ''
+        for prefix in prefixes:
+            choose = random.choice([False, True])
+            if not choose:
+                continue
+            if new_sharding != '':
+                new_sharding = new_sharding + ' '
+            columns = random.randint(1, 5)
+            do_hash = random.choice([False, True])
+            if do_hash:
+                low_hash = random.choice([0, 5, 8])
+                do_high_hash = random.choice([False, True])
+                if do_high_hash:
+                    high_hash = random.choice([8, 16, 30]) + low_hash
+                    new_sharding = new_sharding + prefix + '(' + str(columns) + ',' + str(low_hash) + '-' + str(high_hash) + ')'
+                else:
+                    new_sharding = new_sharding + prefix + '(' + str(columns) + ',' + str(low_hash) + '-)'
+            else:
+                if columns == 1:
+                    new_sharding = new_sharding + prefix
+                else:
+                    new_sharding = new_sharding + prefix + '(' + str(columns) + ')'
+        return new_sharding
+
+    def test_bluestore_reshard_action(self):
+        """
+        Test if resharding of bluestore works properly.
+        If bluestore is not used, or bluestore is in version that
+        does not support sharding, skip.
+        """
+
+        osd = random.choice(self.dead_osds)
+        remote = self.ceph_manager.find_remote('osd', osd)
+        FSPATH = self.ceph_manager.get_filepath()
+
+        prefix = [
+                '--no-mon-config',
+                '--log-file=/var/log/ceph/bluestore_tool.$pid.log',
+                '--log-level=10',
+                '--path', FSPATH.format(id=osd)
+            ]
+
+        # sanity check if bluestore-tool accessible
+        self.log('checking if target objectstore is bluestore on osd.%s' % osd)
+        cmd = prefix + [
+            'show-label'
+            ]
+        proc = self.run_ceph_bluestore_tool(remote, 'osd.%s' % osd, cmd)
+        if proc.exitstatus != 0:
+            raise Exception("ceph-bluestore-tool access failed.")
+
+        # check if sharding is possible
+        self.log('checking if target bluestore supports sharding on osd.%s' % osd)
+        cmd = prefix + [
+            'show-sharding'
+            ]
+        proc = self.run_ceph_bluestore_tool(remote, 'osd.%s' % osd, cmd)
+        if proc.exitstatus != 0:
+            self.log("Unable to test resharding, "
+                     "ceph-bluestore-tool does not support it.")
+            return
+
+        # now go for reshard to something else
+        self.log('applying new sharding to bluestore on osd.%s' % osd)
+        new_sharding = self.config.get('bluestore_new_sharding','random')
+
+        if new_sharding == 'random':
+            self.log('generate random sharding')
+            new_sharding = self.generate_random_sharding()
+
+        self.log("applying new sharding: " + new_sharding)
+        cmd = prefix + [
+            '--sharding', new_sharding,
+            'reshard'
+            ]
+        proc = self.run_ceph_bluestore_tool(remote, 'osd.%s' % osd, cmd)
+        if proc.exitstatus != 0:
+            raise Exception("ceph-bluestore-tool resharding failed.")
+
+        # now do fsck to
+        self.log('running fsck to verify new sharding on osd.%s' % osd)
+        cmd = prefix + [
+            'fsck'
+            ]
+        proc = self.run_ceph_bluestore_tool(remote, 'osd.%s' % osd, cmd)
+        if proc.exitstatus != 0:
+            raise Exception("ceph-bluestore-tool fsck failed.")
+        self.log('resharding successfully completed')
+
+    def test_bluestore_reshard(self):
+        """
+        1) kills an osd
+        2) reshards bluestore on killed osd
+        3) revives the osd
+        """
+        self.log('test_bluestore_reshard started')
+        self.kill_osd(mark_down=True, mark_out=True)
+        self.test_bluestore_reshard_action()
+        self.revive_osd()
+        self.log('test_bluestore_reshard completed')
+
+
+    def test_map_discontinuity(self):
+        """
+        1) Allows the osds to recover
+        2) kills an osd
+        3) allows the remaining osds to recover
+        4) waits for some time
+        5) revives the osd
+        This sequence should cause the revived osd to have to handle
+        a map gap since the mons would have trimmed
+        """
+        self.log("test_map_discontinuity")
+        while len(self.in_osds) < (self.minin + 1):
+            self.in_osd()
+        self.log("Waiting for recovery")
+        self.ceph_manager.wait_for_all_osds_up(
+            timeout=self.config.get('timeout')
+            )
+        # now we wait 20s for the pg status to change, if it takes longer,
+        # the test *should* fail!
+        time.sleep(20)
+        self.ceph_manager.wait_for_clean(
+            timeout=self.config.get('timeout')
+            )
+
+        # now we wait 20s for the backfill replicas to hear about the clean
+        time.sleep(20)
+        self.log("Recovered, killing an osd")
+        self.kill_osd(mark_down=True, mark_out=True)
+        self.log("Waiting for clean again")
+        self.ceph_manager.wait_for_clean(
+            timeout=self.config.get('timeout')
+            )
+        self.log("Waiting for trim")
+        time.sleep(int(self.config.get("map_discontinuity_sleep_time", 40)))
+        self.revive_osd()
+
+    def choose_action(self):
+        """
+        Random action selector.
+        """
+        chance_down = self.config.get('chance_down', 0.4)
+        _ = self.config.get('chance_test_min_size', 0)
+        chance_test_backfill_full = \
+            self.config.get('chance_test_backfill_full', 0)
+        if isinstance(chance_down, int):
+            chance_down = float(chance_down) / 100
+        minin = self.minin
+        minout = int(self.config.get("min_out", 0))
+        minlive = int(self.config.get("min_live", 2))
+        mindead = int(self.config.get("min_dead", 0))
+
+        self.log('choose_action: min_in %d min_out '
+                 '%d min_live %d min_dead %d '
+                 'chance_down %.2f' %
+                 (minin, minout, minlive, mindead, chance_down))
+        actions = []
+        if len(self.in_osds) > minin:
+            actions.append((self.out_osd, 1.0,))
+        if len(self.live_osds) > minlive and chance_down > 0:
+            actions.append((self.kill_osd, chance_down,))
+        if len(self.out_osds) > minout:
+            actions.append((self.in_osd, 1.7,))
+        if len(self.dead_osds) > mindead:
+            actions.append((self.revive_osd, 1.0,))
+        if self.config.get('thrash_primary_affinity', True):
+            actions.append((self.primary_affinity, 1.0,))
+        actions.append((self.reweight_osd_or_by_util,
+                        self.config.get('reweight_osd', .5),))
+        actions.append((self.grow_pool,
+                        self.config.get('chance_pgnum_grow', 0),))
+        actions.append((self.shrink_pool,
+                        self.config.get('chance_pgnum_shrink', 0),))
+        actions.append((self.fix_pgp_num,
+                        self.config.get('chance_pgpnum_fix', 0),))
+        actions.append((self.test_pool_min_size,
+                        self.config.get('chance_test_min_size', 0),))
+        actions.append((self.test_backfill_full,
+                        chance_test_backfill_full,))
+        if self.chance_thrash_cluster_full > 0:
+            actions.append((self.thrash_cluster_full, self.chance_thrash_cluster_full,))
+        if self.chance_thrash_pg_upmap > 0:
+            actions.append((self.thrash_pg_upmap, self.chance_thrash_pg_upmap,))
+        if self.chance_thrash_pg_upmap_items > 0:
+            actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,))
+        if self.chance_force_recovery > 0:
+            actions.append((self.force_cancel_recovery, self.chance_force_recovery))
+
+        for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
+            for scenario in [
+                (lambda:
+                 self.inject_pause(key,
+                                   self.config.get('pause_short', 3),
+                                   0,
+                                   False),
+                 self.config.get('chance_inject_pause_short', 1),),
+                (lambda:
+                 self.inject_pause(key,
+                                   self.config.get('pause_long', 80),
+                                   self.config.get('pause_check_after', 70),
+                                   True),
+                 self.config.get('chance_inject_pause_long', 0),)]:
+                actions.append(scenario)
+
+        # only consider resharding if objectstore is bluestore
+        cluster_name = self.ceph_manager.cluster
+        cluster = self.ceph_manager.ctx.ceph[cluster_name]
+        if cluster.conf.get('osd', {}).get('osd objectstore', 'bluestore') == 'bluestore':
+            actions.append((self.test_bluestore_reshard,
+                            self.config.get('chance_bluestore_reshard', 0),))
+
+        total = sum([y for (x, y) in actions])
+        val = random.uniform(0, total)
+        for (action, prob) in actions:
+            if val < prob:
+                return action
+            val -= prob
+        return None
+
+    def do_thrash(self):
+        """
+        _do_thrash() wrapper.
+        """
+        try:
+            self._do_thrash()
+        except Exception as e:
+            # See _run exception comment for MDSThrasher
+            self.set_thrasher_exception(e)
+            self.logger.exception("exception:")
+            # Allow successful completion so gevent doesn't see an exception.
+            # The DaemonWatchdog will observe the error and tear down the test.
+
+    @log_exc
+    def do_sighup(self):
+        """
+        Loops and sends signal.SIGHUP to a random live osd.
+
+        Loop delay is controlled by the config value sighup_delay.
+        """
+        delay = float(self.sighup_delay)
+        self.log("starting do_sighup with a delay of {0}".format(delay))
+        while not self.stopping:
+            osd = random.choice(self.live_osds)
+            self.ceph_manager.signal_osd(osd, signal.SIGHUP, silent=True)
+            time.sleep(delay)
+
+    @log_exc
+    def do_optrack_toggle(self):
+        """
+        Loops and toggle op tracking to all osds.
+
+        Loop delay is controlled by the config value optrack_toggle_delay.
+        """
+        delay = float(self.optrack_toggle_delay)
+        osd_state = "true"
+        self.log("starting do_optrack_toggle with a delay of {0}".format(delay))
+        while not self.stopping:
+            if osd_state == "true":
+                osd_state = "false"
+            else:
+                osd_state = "true"
+            try:
+                self.ceph_manager.inject_args('osd', '*',
+                                              'osd_enable_op_tracker',
+                                              osd_state)
+            except CommandFailedError:
+                self.log('Failed to tell all osds, ignoring')
+            gevent.sleep(delay)
+
+    @log_exc
+    def do_dump_ops(self):
+        """
+        Loops and does op dumps on all osds
+        """
+        self.log("starting do_dump_ops")
+        while not self.stopping:
+            for osd in self.live_osds:
+                # Ignore errors because live_osds is in flux
+                self.ceph_manager.osd_admin_socket(osd, command=['dump_ops_in_flight'],
+                                     check_status=False, timeout=30, stdout=DEVNULL)
+                self.ceph_manager.osd_admin_socket(osd, command=['dump_blocked_ops'],
+                                     check_status=False, timeout=30, stdout=DEVNULL)
+                self.ceph_manager.osd_admin_socket(osd, command=['dump_historic_ops'],
+                                     check_status=False, timeout=30, stdout=DEVNULL)
+            gevent.sleep(0)
+
+    @log_exc
+    def do_noscrub_toggle(self):
+        """
+        Loops and toggle noscrub flags
+
+        Loop delay is controlled by the config value noscrub_toggle_delay.
+        """
+        delay = float(self.noscrub_toggle_delay)
+        scrub_state = "none"
+        self.log("starting do_noscrub_toggle with a delay of {0}".format(delay))
+        while not self.stopping:
+            if scrub_state == "none":
+                self.ceph_manager.raw_cluster_cmd('osd', 'set', 'noscrub')
+                scrub_state = "noscrub"
+            elif scrub_state == "noscrub":
+                self.ceph_manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
+                scrub_state = "both"
+            elif scrub_state == "both":
+                self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
+                scrub_state = "nodeep-scrub"
+            else:
+                self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
+                scrub_state = "none"
+            gevent.sleep(delay)
+        self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
+        self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
+
+    @log_exc
+    def _do_thrash(self):
+        """
+        Loop to select random actions to thrash ceph manager with.
+        """
+        cleanint = self.config.get("clean_interval", 60)
+        scrubint = self.config.get("scrub_interval", -1)
+        maxdead = self.config.get("max_dead", 0)
+        delay = self.config.get("op_delay", 5)
+        self.rerrosd = self.live_osds[0]
+        if self.random_eio > 0:
+            self.ceph_manager.inject_args('osd', self.rerrosd,
+                                          'filestore_debug_random_read_err',
+                                          self.random_eio)
+            self.ceph_manager.inject_args('osd', self.rerrosd,
+                                          'bluestore_debug_random_read_err',
+                                          self.random_eio)
+        self.log("starting do_thrash")
+        while not self.stopping:
+            to_log = [str(x) for x in ["in_osds: ", self.in_osds,
+                                       "out_osds: ", self.out_osds,
+                                       "dead_osds: ", self.dead_osds,
+                                       "live_osds: ", self.live_osds]]
+            self.log(" ".join(to_log))
+            if random.uniform(0, 1) < (float(delay) / cleanint):
+                while len(self.dead_osds) > maxdead:
+                    self.revive_osd()
+                for osd in self.in_osds:
+                    self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
+                                                      str(osd), str(1))
+                if random.uniform(0, 1) < float(
+                        self.config.get('chance_test_map_discontinuity', 0)) \
+                        and len(self.live_osds) > 5: # avoid m=2,k=2 stall, w/ some buffer for crush being picky
+                    self.test_map_discontinuity()
+                else:
+                    self.ceph_manager.wait_for_recovery(
+                        timeout=self.config.get('timeout')
+                        )
+                time.sleep(self.clean_wait)
+                if scrubint > 0:
+                    if random.uniform(0, 1) < (float(delay) / scrubint):
+                        self.log('Scrubbing while thrashing being performed')
+                        Scrubber(self.ceph_manager, self.config)
+            self.choose_action()()
+            time.sleep(delay)
+        self.all_up()
+        if self.random_eio > 0:
+            self.ceph_manager.inject_args('osd', self.rerrosd,
+                                          'filestore_debug_random_read_err', '0.0')
+            self.ceph_manager.inject_args('osd', self.rerrosd,
+                                          'bluestore_debug_random_read_err', '0.0')
+        for pool in list(self.pools_to_fix_pgp_num):
+            if self.ceph_manager.get_pool_pg_num(pool) > 0:
+                self.fix_pgp_num(pool)
+        self.pools_to_fix_pgp_num.clear()
+        for service, opt, saved_value in self.saved_options:
+            self.ceph_manager.inject_args(service, '*', opt, saved_value)
+        self.saved_options = []
+        self.all_up_in()
+
+
+class ObjectStoreTool:
+
+    def __init__(self, manager, pool, **kwargs):
+        self.manager = manager
+        self.pool = pool
+        self.osd = kwargs.get('osd', None)
+        self.object_name = kwargs.get('object_name', None)
+        self.do_revive = kwargs.get('do_revive', True)
+        if self.osd and self.pool and self.object_name:
+            if self.osd == "primary":
+                self.osd = self.manager.get_object_primary(self.pool,
+                                                           self.object_name)
+        assert self.osd is not None
+        if self.object_name:
+            self.pgid = self.manager.get_object_pg_with_shard(self.pool,
+                                                              self.object_name,
+                                                              self.osd)
+        self.remote = next(iter(self.manager.ctx.\
+            cluster.only('osd.{o}'.format(o=self.osd)).remotes.keys()))
+        path = self.manager.get_filepath().format(id=self.osd)
+        self.paths = ("--data-path {path} --journal-path {path}/journal".
+                      format(path=path))
+
+    def build_cmd(self, options, args, stdin):
+        lines = []
+        if self.object_name:
+            lines.append("object=$(sudo adjust-ulimits ceph-objectstore-tool "
+                         "{paths} --pgid {pgid} --op list |"
+                         "grep '\"oid\":\"{name}\"')".
+                         format(paths=self.paths,
+                                pgid=self.pgid,
+                                name=self.object_name))
+            args = '"$object" ' + args
+            options += " --pgid {pgid}".format(pgid=self.pgid)
+        cmd = ("sudo adjust-ulimits ceph-objectstore-tool {paths} {options} {args}".
+               format(paths=self.paths,
+                      args=args,
+                      options=options))
+        if stdin:
+            cmd = ("echo {payload} | base64 --decode | {cmd}".
+                   format(payload=base64.encode(stdin),
+                          cmd=cmd))
+        lines.append(cmd)
+        return "\n".join(lines)
+
+    def run(self, options, args):
+        self.manager.kill_osd(self.osd)
+        cmd = self.build_cmd(options, args, None)
+        self.manager.log(cmd)
+        try:
+            proc = self.remote.run(args=['bash', '-e', '-x', '-c', cmd],
+                                   check_status=False,
+                                   stdout=BytesIO(),
+                                   stderr=BytesIO())
+            proc.wait()
+            if proc.exitstatus != 0:
+                self.manager.log("failed with " + str(proc.exitstatus))
+                error = proc.stdout.getvalue().decode()  + " " + \
+                        proc.stderr.getvalue().decode()
+                raise Exception(error)
+        finally:
+            if self.do_revive:
+                self.manager.revive_osd(self.osd)
+                self.manager.wait_till_osd_is_up(self.osd, 300)
+
+
+# XXX: this class has nothing to do with the Ceph daemon (ceph-mgr) of
+# the same name.
+class CephManager:
+    """
+    Ceph manager object.
+    Contains several local functions that form a bulk of this module.
+
+    :param controller: the remote machine where the Ceph commands should be
+                       executed
+    :param ctx: the cluster context
+    :param config: path to Ceph config file
+    :param logger: for logging messages
+    :param cluster: name of the Ceph cluster
+    """
+
+    def __init__(self, controller, ctx=None, config=None, logger=None,
+                 cluster='ceph', cephadm=False, rook=False) -> None:
+        self.lock = threading.RLock()
+        self.ctx = ctx
+        self.config = config
+        self.controller = controller
+        self.next_pool_id = 0
+        self.cluster = cluster
+        self.cephadm = cephadm
+        self.rook = rook
+        if (logger):
+            self.log = lambda x: logger.info(x)
+        else:
+            def tmp(x):
+                """
+                implement log behavior.
+                """
+                print(x)
+            self.log = tmp
+        if self.config is None:
+            self.config = dict()
+        pools = self.list_pools()
+        self.pools = {}
+        for pool in pools:
+            # we may race with a pool deletion; ignore failures here
+            try:
+                self.pools[pool] = self.get_pool_int_property(pool, 'pg_num')
+            except CommandFailedError:
+                self.log('Failed to get pg_num from pool %s, ignoring' % pool)
+
+    def ceph(self, cmd, **kwargs):
+        """
+        Simple Ceph admin command wrapper around run_cluster_cmd.
+        """
+
+        kwargs.pop('args', None)
+        args = shlex.split(cmd)
+        stdout = kwargs.pop('stdout', StringIO())
+        stderr = kwargs.pop('stderr', StringIO())
+        return self.run_cluster_cmd(args=args, stdout=stdout, stderr=stderr, **kwargs)
+
+    def run_cluster_cmd(self, **kwargs):
+        """
+        Run a Ceph command and return the object representing the process
+        for the command.
+
+        Accepts arguments same as that of teuthology.orchestra.run.run()
+        """
+        if self.cephadm:
+            return shell(self.ctx, self.cluster, self.controller,
+                         args=['ceph'] + list(kwargs['args']),
+                         stdout=StringIO(),
+                         check_status=kwargs.get('check_status', True))
+        if self.rook:
+            return toolbox(self.ctx, self.cluster,
+                           args=['ceph'] + list(kwargs['args']),
+                           stdout=StringIO(),
+                           check_status=kwargs.get('check_status', True))
+
+        testdir = teuthology.get_testdir(self.ctx)
+        prefix = ['sudo', 'adjust-ulimits', 'ceph-coverage',
+                  f'{testdir}/archive/coverage', 'timeout', '120', 'ceph',
+                  '--cluster', self.cluster]
+        kwargs['args'] = prefix + list(kwargs['args'])
+        return self.controller.run(**kwargs)
+
+    def raw_cluster_cmd(self, *args, **kwargs) -> str:
+        """
+        Start ceph on a raw cluster.  Return count
+        """
+        stdout = kwargs.pop('stdout', StringIO())
+        p = self.run_cluster_cmd(args=args, stdout=stdout, **kwargs)
+        return p.stdout.getvalue()
+
+    def raw_cluster_cmd_result(self, *args, **kwargs):
+        """
+        Start ceph on a cluster.  Return success or failure information.
+        """
+        kwargs['args'], kwargs['check_status'] = args, False
+        return self.run_cluster_cmd(**kwargs).exitstatus
+
+    def run_ceph_w(self, watch_channel=None):
+        """
+        Execute "ceph -w" in the background with stdout connected to a BytesIO,
+        and return the RemoteProcess.
+
+        :param watch_channel: Specifies the channel to be watched. This can be
+                              'cluster', 'audit', ...
+        :type watch_channel: str
+        """
+        args = ["sudo",
+                "daemon-helper",
+                "kill",
+                "ceph",
+                '--cluster',
+                self.cluster,
+                "-w"]
+        if watch_channel is not None:
+            args.append("--watch-channel")
+            args.append(watch_channel)
+        return self.controller.run(args=args, wait=False, stdout=StringIO(), stdin=run.PIPE)
+
+    def get_mon_socks(self):
+        """
+        Get monitor sockets.
+
+        :return socks: tuple of strings; strings are individual sockets.
+        """
+        from json import loads
+
+        output = loads(self.raw_cluster_cmd(['--format=json', 'mon', 'dump']))
+        socks = []
+        for mon in output['mons']:
+            for addrvec_mem in mon['public_addrs']['addrvec']:
+                socks.append(addrvec_mem['addr'])
+        return tuple(socks)
+
+    def get_msgrv1_mon_socks(self):
+        """
+        Get monitor sockets that use msgrv1 to operate.
+
+        :return socks: tuple of strings; strings are individual sockets.
+        """
+        from json import loads
+
+        output = loads(self.raw_cluster_cmd('--format=json', 'mon', 'dump'))
+        socks = []
+        for mon in output['mons']:
+            for addrvec_mem in mon['public_addrs']['addrvec']:
+                if addrvec_mem['type'] == 'v1':
+                    socks.append(addrvec_mem['addr'])
+        return tuple(socks)
+
+    def get_msgrv2_mon_socks(self):
+        """
+        Get monitor sockets that use msgrv2 to operate.
+
+        :return socks: tuple of strings; strings are individual sockets.
+        """
+        from json import loads
+
+        output = loads(self.raw_cluster_cmd('--format=json', 'mon', 'dump'))
+        socks = []
+        for mon in output['mons']:
+            for addrvec_mem in mon['public_addrs']['addrvec']:
+                if addrvec_mem['type'] == 'v2':
+                    socks.append(addrvec_mem['addr'])
+        return tuple(socks)
+
+    def flush_pg_stats(self, osds, no_wait=None, wait_for_mon=300):
+        """
+        Flush pg stats from a list of OSD ids, ensuring they are reflected
+        all the way to the monitor.  Luminous and later only.
+
+        :param osds: list of OSDs to flush
+        :param no_wait: list of OSDs not to wait for seq id. by default, we
+                        wait for all specified osds, but some of them could be
+                        moved out of osdmap, so we cannot get their updated
+                        stat seq from monitor anymore. in that case, you need
+                        to pass a blocklist.
+        :param wait_for_mon: wait for mon to be synced with mgr. 0 to disable
+                             it. (5 min by default)
+        """
+        seq = {osd: int(self.raw_cluster_cmd('tell', 'osd.%d' % osd, 'flush_pg_stats'))
+               for osd in osds}
+        if not wait_for_mon:
+            return
+        if no_wait is None:
+            no_wait = []
+        for osd, need in seq.items():
+            if osd in no_wait:
+                continue
+            got = 0
+            while wait_for_mon > 0:
+                got = int(self.raw_cluster_cmd('osd', 'last-stat-seq', 'osd.%d' % osd))
+                self.log('need seq {need} got {got} for osd.{osd}'.format(
+                    need=need, got=got, osd=osd))
+                if got >= need:
+                    break
+                A_WHILE = 1
+                time.sleep(A_WHILE)
+                wait_for_mon -= A_WHILE
+            else:
+                raise Exception('timed out waiting for mon to be updated with '
+                                'osd.{osd}: {got} < {need}'.
+                                format(osd=osd, got=got, need=need))
+
+    def flush_all_pg_stats(self):
+        self.flush_pg_stats(range(len(self.get_osd_dump())))
+
+    def do_rados(self, cmd, pool=None, namespace=None, remote=None, **kwargs):
+        """
+        Execute a remote rados command.
+        """
+        if remote is None:
+            remote = self.controller
+
+        testdir = teuthology.get_testdir(self.ctx)
+        pre = [
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            'rados',
+            '--cluster',
+            self.cluster,
+            ]
+        if pool is not None:
+            pre += ['--pool', pool]
+        if namespace is not None:
+            pre += ['--namespace', namespace]
+        pre.extend(cmd)
+        proc = remote.run(
+            args=pre,
+            wait=True,
+            **kwargs
+            )
+        return proc
+
+    def rados_write_objects(self, pool, num_objects, size,
+                            timelimit, threads, cleanup=False):
+        """
+        Write rados objects
+        Threads not used yet.
+        """
+        args = [
+            '--num-objects', num_objects,
+            '-b', size,
+            'bench', timelimit,
+            'write'
+            ]
+        if not cleanup:
+            args.append('--no-cleanup')
+        return self.do_rados(map(str, args), pool=pool)
+
+    def do_put(self, pool, obj, fname, namespace=None):
+        """
+        Implement rados put operation
+        """
+        args = ['put', obj, fname]
+        return self.do_rados(
+            args,
+            check_status=False,
+            pool=pool,
+            namespace=namespace
+        ).exitstatus
+
+    def do_get(self, pool, obj, fname='/dev/null', namespace=None):
+        """
+        Implement rados get operation
+        """
+        args = ['get', obj, fname]
+        return self.do_rados(
+            args,
+            check_status=False,
+            pool=pool,
+            namespace=namespace,
+        ).exitstatus
+
+    def do_rm(self, pool, obj, namespace=None):
+        """
+        Implement rados rm operation
+        """
+        args = ['rm', obj]
+        return self.do_rados(
+            args,
+            check_status=False,
+            pool=pool,
+            namespace=namespace
+        ).exitstatus
+
+    def osd_admin_socket(self, osd_id, command, check_status=True, timeout=0, stdout=None):
+        if stdout is None:
+            stdout = StringIO()
+        return self.admin_socket('osd', osd_id, command, check_status, timeout, stdout)
+
+    def find_remote(self, service_type, service_id):
+        """
+        Get the Remote for the host where a particular service runs.
+
+        :param service_type: 'mds', 'osd', 'client'
+        :param service_id: The second part of a role, e.g. '0' for
+                           the role 'client.0'
+        :return: a Remote instance for the host where the
+                 requested role is placed
+        """
+        return get_remote(self.ctx, self.cluster,
+                          service_type, service_id)
+
+    def admin_socket(self, service_type, service_id,
+                     command, check_status=True, timeout=0, stdout=None):
+        """
+        Remotely start up ceph specifying the admin socket
+        :param command: a list of words to use as the command
+                        to the admin socket
+        """
+        if stdout is None:
+            stdout = StringIO()
+
+        remote = self.find_remote(service_type, service_id)
+
+        if self.cephadm:
+            return shell(
+                self.ctx, self.cluster, remote,
+                args=[
+                    'ceph', 'daemon', '%s.%s' % (service_type, service_id),
+                ] + command,
+                stdout=stdout,
+                wait=True,
+                check_status=check_status,
+            )
+        if self.rook:
+            assert False, 'not implemented'
+
+        testdir = teuthology.get_testdir(self.ctx)
+        args = [
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            'timeout',
+            str(timeout),
+            'ceph',
+            '--cluster',
+            self.cluster,
+            '--admin-daemon',
+            '/var/run/ceph/{cluster}-{type}.{id}.asok'.format(
+                cluster=self.cluster,
+                type=service_type,
+                id=service_id),
+            ]
+        args.extend(command)
+        return remote.run(
+            args=args,
+            stdout=stdout,
+            wait=True,
+            check_status=check_status
+            )
+
+    def objectstore_tool(self, pool, options, args, **kwargs):
+        return ObjectStoreTool(self, pool, **kwargs).run(options, args)
+
+    def get_pgid(self, pool, pgnum):
+        """
+        :param pool: pool name
+        :param pgnum: pg number
+        :returns: a string representing this pg.
+        """
+        poolnum = self.get_pool_num(pool)
+        pg_str = "{poolnum}.{pgnum}".format(
+            poolnum=poolnum,
+            pgnum=pgnum)
+        return pg_str
+
+    def get_pg_replica(self, pool, pgnum):
+        """
+        get replica for pool, pgnum (e.g. (data, 0)->0
+        """
+        pg_str = self.get_pgid(pool, pgnum)
+        output = self.raw_cluster_cmd("pg", "map", pg_str, '--format=json')
+        j = json.loads('\n'.join(output.split('\n')[1:]))
+        return int(j['acting'][-1])
+        assert False
+
+    def wait_for_pg_stats(func):
+        # both osd_mon_report_interval and mgr_stats_period are 5 seconds
+        # by default, and take the faulty injection in ms into consideration,
+        # 12 seconds are more than enough
+        delays = [1, 1, 2, 3, 5, 8, 13, 0]
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            exc = None
+            for delay in delays:
+                try:
+                    return func(self, *args, **kwargs)
+                except AssertionError as e:
+                    time.sleep(delay)
+                    exc = e
+            raise exc
+        return wrapper
+
+    def get_pg_primary(self, pool, pgnum):
+        """
+        get primary for pool, pgnum (e.g. (data, 0)->0
+        """
+        pg_str = self.get_pgid(pool, pgnum)
+        output = self.raw_cluster_cmd("pg", "map", pg_str, '--format=json')
+        j = json.loads('\n'.join(output.split('\n')[1:]))
+        return int(j['acting'][0])
+        assert False
+
+    def get_pool_num(self, pool):
+        """
+        get number for pool (e.g., data -> 2)
+        """
+        return int(self.get_pool_dump(pool)['pool'])
+
+    def list_pools(self):
+        """
+        list all pool names
+        """
+        osd_dump = self.get_osd_dump_json()
+        self.log(osd_dump['pools'])
+        return [str(i['pool_name']) for i in osd_dump['pools']]
+
+    def clear_pools(self):
+        """
+        remove all pools
+        """
+        [self.remove_pool(i) for i in self.list_pools()]
+
+    def kick_recovery_wq(self, osdnum):
+        """
+        Run kick_recovery_wq on cluster.
+        """
+        return self.raw_cluster_cmd(
+            'tell', "osd.%d" % (int(osdnum),),
+            'debug',
+            'kick_recovery_wq',
+            '0')
+
+    def wait_run_admin_socket(self, service_type,
+                              service_id, args=['version'], timeout=75, stdout=None):
+        """
+        If osd_admin_socket call succeeds, return.  Otherwise wait
+        five seconds and try again.
+        """
+        if stdout is None:
+            stdout = StringIO()
+        tries = 0
+        while True:
+            proc = self.admin_socket(service_type, service_id,
+                                     args, check_status=False, stdout=stdout)
+            if proc.exitstatus == 0:
+                return proc
+            else:
+                tries += 1
+                if (tries * 5) > timeout:
+                    raise Exception('timed out waiting for admin_socket '
+                                    'to appear after {type}.{id} restart'.
+                                    format(type=service_type,
+                                           id=service_id))
+                self.log("waiting on admin_socket for {type}-{id}, "
+                         "{command}".format(type=service_type,
+                                            id=service_id,
+                                            command=args))
+                time.sleep(5)
+
+    def get_pool_dump(self, pool):
+        """
+        get the osd dump part of a pool
+        """
+        osd_dump = self.get_osd_dump_json()
+        for i in osd_dump['pools']:
+            if i['pool_name'] == pool:
+                return i
+        assert False
+
+    def get_config(self, service_type, service_id, name):
+        """
+        :param node: like 'mon.a'
+        :param name: the option name
+        """
+        proc = self.wait_run_admin_socket(service_type, service_id,
+                                          ['config', 'show'])
+        j = json.loads(proc.stdout.getvalue())
+        return j[name]
+
+    def inject_args(self, service_type, service_id, name, value):
+        whom = '{0}.{1}'.format(service_type, service_id)
+        if isinstance(value, bool):
+            value = 'true' if value else 'false'
+        opt_arg = '--{name}={value}'.format(name=name, value=value)
+        self.raw_cluster_cmd('--', 'tell', whom, 'injectargs', opt_arg)
+
+    def set_config(self, osdnum, **argdict):
+        """
+        :param osdnum: osd number
+        :param argdict: dictionary containing values to set.
+        """
+        for k, v in argdict.items():
+            self.wait_run_admin_socket(
+                'osd', osdnum,
+                ['config', 'set', str(k), str(v)])
+
+    def raw_cluster_status(self):
+        """
+        Get status from cluster
+        """
+        status = self.raw_cluster_cmd('status', '--format=json')
+        return json.loads(status)
+
+    def raw_osd_status(self):
+        """
+        Get osd status from cluster
+        """
+        return self.raw_cluster_cmd('osd', 'dump')
+
+    def get_osd_status(self):
+        """
+        Get osd statuses sorted by states that the osds are in.
+        """
+        osd_lines = list(filter(
+            lambda x: x.startswith('osd.') and (("up" in x) or ("down" in x)),
+            self.raw_osd_status().split('\n')))
+        self.log(osd_lines)
+        in_osds = [int(i[4:].split()[0])
+                   for i in filter(lambda x: " in " in x, osd_lines)]
+        out_osds = [int(i[4:].split()[0])
+                    for i in filter(lambda x: " out " in x, osd_lines)]
+        up_osds = [int(i[4:].split()[0])
+                   for i in filter(lambda x: " up " in x, osd_lines)]
+        down_osds = [int(i[4:].split()[0])
+                     for i in filter(lambda x: " down " in x, osd_lines)]
+        dead_osds = [int(x.id_)
+                     for x in filter(lambda x:
+                                     not x.running(),
+                                     self.ctx.daemons.
+                                     iter_daemons_of_role('osd', self.cluster))]
+        live_osds = [int(x.id_) for x in
+                     filter(lambda x:
+                            x.running(),
+                            self.ctx.daemons.iter_daemons_of_role('osd',
+                                                                  self.cluster))]
+        return {'in': in_osds, 'out': out_osds, 'up': up_osds,
+                'down': down_osds, 'dead': dead_osds, 'live': live_osds,
+                'raw': osd_lines}
+
+    def get_num_pgs(self):
+        """
+        Check cluster status for the number of pgs
+        """
+        status = self.raw_cluster_status()
+        self.log(status)
+        return status['pgmap']['num_pgs']
+
+    def create_erasure_code_profile(self, profile_name, profile):
+        """
+        Create an erasure code profile name that can be used as a parameter
+        when creating an erasure coded pool.
+        """
+        with self.lock:
+            args = cmd_erasure_code_profile(profile_name, profile)
+            self.raw_cluster_cmd(*args)
+
+    def create_pool_with_unique_name(self, pg_num=16,
+                                     erasure_code_profile_name=None,
+                                     min_size=None,
+                                     erasure_code_use_overwrites=False):
+        """
+        Create a pool named unique_pool_X where X is unique.
+        """
+        name = ""
+        with self.lock:
+            name = "unique_pool_%s" % (str(self.next_pool_id),)
+            self.next_pool_id += 1
+            self.create_pool(
+                name,
+                pg_num,
+                erasure_code_profile_name=erasure_code_profile_name,
+                min_size=min_size,
+                erasure_code_use_overwrites=erasure_code_use_overwrites)
+        return name
+
+    @contextlib.contextmanager
+    def pool(self, pool_name, pg_num=16, erasure_code_profile_name=None):
+        self.create_pool(pool_name, pg_num, erasure_code_profile_name)
+        yield
+        self.remove_pool(pool_name)
+
+    def create_pool(self, pool_name, pg_num=16,
+                    erasure_code_profile_name=None,
+                    min_size=None,
+                    erasure_code_use_overwrites=False):
+        """
+        Create a pool named from the pool_name parameter.
+        :param pool_name: name of the pool being created.
+        :param pg_num: initial number of pgs.
+        :param erasure_code_profile_name: if set and !None create an
+                                          erasure coded pool using the profile
+        :param erasure_code_use_overwrites: if true, allow overwrites
+        """
+        with self.lock:
+            assert isinstance(pool_name, str)
+            assert isinstance(pg_num, int)
+            assert pool_name not in self.pools
+            self.log("creating pool_name %s" % (pool_name,))
+            if erasure_code_profile_name:
+                self.raw_cluster_cmd('osd', 'pool', 'create',
+                                     pool_name, str(pg_num), str(pg_num),
+                                     'erasure', erasure_code_profile_name)
+            else:
+                self.raw_cluster_cmd('osd', 'pool', 'create',
+                                     pool_name, str(pg_num))
+            if min_size is not None:
+                self.raw_cluster_cmd(
+                    'osd', 'pool', 'set', pool_name,
+                    'min_size',
+                    str(min_size))
+            if erasure_code_use_overwrites:
+                self.raw_cluster_cmd(
+                    'osd', 'pool', 'set', pool_name,
+                    'allow_ec_overwrites',
+                    'true')
+            self.raw_cluster_cmd(
+                'osd', 'pool', 'application', 'enable',
+                pool_name, 'rados', '--yes-i-really-mean-it',
+                run.Raw('||'), 'true')
+            self.pools[pool_name] = pg_num
+        time.sleep(1)
+
+    def add_pool_snap(self, pool_name, snap_name):
+        """
+        Add pool snapshot
+        :param pool_name: name of pool to snapshot
+        :param snap_name: name of snapshot to take
+        """
+        self.raw_cluster_cmd('osd', 'pool', 'mksnap',
+                             str(pool_name), str(snap_name))
+
+    def remove_pool_snap(self, pool_name, snap_name):
+        """
+        Remove pool snapshot
+        :param pool_name: name of pool to snapshot
+        :param snap_name: name of snapshot to remove
+        """
+        self.raw_cluster_cmd('osd', 'pool', 'rmsnap',
+                             str(pool_name), str(snap_name))
+
+    def remove_pool(self, pool_name):
+        """
+        Remove the indicated pool
+        :param pool_name: Pool to be removed
+        """
+        with self.lock:
+            assert isinstance(pool_name, str)
+            assert pool_name in self.pools
+            self.log("removing pool_name %s" % (pool_name,))
+            del self.pools[pool_name]
+            self.raw_cluster_cmd('osd', 'pool', 'rm', pool_name, pool_name,
+                                 "--yes-i-really-really-mean-it")
+
+    def get_pool(self):
+        """
+        Pick a random pool
+        """
+        with self.lock:
+            if self.pools:
+                return random.sample(self.pools.keys(), 1)[0]
+
+    def get_pool_pg_num(self, pool_name):
+        """
+        Return the number of pgs in the pool specified.
+        """
+        with self.lock:
+            assert isinstance(pool_name, str)
+            if pool_name in self.pools:
+                return self.pools[pool_name]
+            return 0
+
+    def get_pool_property(self, pool_name, prop):
+        """
+        :param pool_name: pool
+        :param prop: property to be checked.
+        :returns: property as string
+        """
+        with self.lock:
+            assert isinstance(pool_name, str)
+            assert isinstance(prop, str)
+            output = self.raw_cluster_cmd(
+                'osd',
+                'pool',
+                'get',
+                pool_name,
+                prop)
+            return output.split()[1]
+
+    def get_pool_int_property(self, pool_name, prop):
+        return int(self.get_pool_property(pool_name, prop))
+
+    def set_pool_property(self, pool_name, prop, val):
+        """
+        :param pool_name: pool
+        :param prop: property to be set.
+        :param val: value to set.
+
+        This routine retries if set operation fails.
+        """
+        with self.lock:
+            assert isinstance(pool_name, str)
+            assert isinstance(prop, str)
+            assert isinstance(val, int)
+            tries = 0
+            while True:
+                r = self.raw_cluster_cmd_result(
+                    'osd',
+                    'pool',
+                    'set',
+                    pool_name,
+                    prop,
+                    str(val))
+                if r != 11:  # EAGAIN
+                    break
+                tries += 1
+                if tries > 50:
+                    raise Exception('timed out getting EAGAIN '
+                                    'when setting pool property %s %s = %s' %
+                                    (pool_name, prop, val))
+                self.log('got EAGAIN setting pool property, '
+                         'waiting a few seconds...')
+                time.sleep(2)
+
+    def expand_pool(self, pool_name, by, max_pgs):
+        """
+        Increase the number of pgs in a pool
+        """
+        with self.lock:
+            assert isinstance(pool_name, str)
+            assert isinstance(by, int)
+            assert pool_name in self.pools
+            if self.get_num_creating() > 0:
+                return False
+            if (self.pools[pool_name] + by) > max_pgs:
+                return False
+            self.log("increase pool size by %d" % (by,))
+            new_pg_num = self.pools[pool_name] + by
+            self.set_pool_property(pool_name, "pg_num", new_pg_num)
+            self.pools[pool_name] = new_pg_num
+            return True
+
+    def contract_pool(self, pool_name, by, min_pgs):
+        """
+        Decrease the number of pgs in a pool
+        """
+        with self.lock:
+            self.log('contract_pool %s by %s min %s' % (
+                     pool_name, str(by), str(min_pgs)))
+            assert isinstance(pool_name, str)
+            assert isinstance(by, int)
+            assert pool_name in self.pools
+            if self.get_num_creating() > 0:
+                self.log('too many creating')
+                return False
+            proj = self.pools[pool_name] - by
+            if proj < min_pgs:
+                self.log('would drop below min_pgs, proj %d, currently %d' % (proj,self.pools[pool_name],))
+                return False
+            self.log("decrease pool size by %d" % (by,))
+            new_pg_num = self.pools[pool_name] - by
+            self.set_pool_property(pool_name, "pg_num", new_pg_num)
+            self.pools[pool_name] = new_pg_num
+            return True
+
+    def stop_pg_num_changes(self):
+        """
+        Reset all pg_num_targets back to pg_num, canceling splits and merges
+        """
+        self.log('Canceling any pending splits or merges...')
+        osd_dump = self.get_osd_dump_json()
+        try:
+            for pool in osd_dump['pools']:
+                if pool['pg_num'] != pool['pg_num_target']:
+                    self.log('Setting pool %s (%d) pg_num %d -> %d' %
+                             (pool['pool_name'], pool['pool'],
+                              pool['pg_num_target'],
+                              pool['pg_num']))
+                    self.raw_cluster_cmd('osd', 'pool', 'set', pool['pool_name'],
+                                         'pg_num', str(pool['pg_num']))
+        except KeyError:
+            # we don't support pg_num_target before nautilus
+            pass
+
+    def set_pool_pgpnum(self, pool_name, force):
+        """
+        Set pgpnum property of pool_name pool.
+        """
+        with self.lock:
+            assert isinstance(pool_name, str)
+            assert pool_name in self.pools
+            if not force and self.get_num_creating() > 0:
+                return False
+            self.set_pool_property(pool_name, 'pgp_num', self.pools[pool_name])
+            return True
+
+    def list_pg_unfound(self, pgid):
+        """
+        return list of unfound pgs with the id specified
+        """
+        r = None
+        offset = {}
+        while True:
+            out = self.raw_cluster_cmd('--', 'pg', pgid, 'list_unfound',
+                                       json.dumps(offset))
+            j = json.loads(out)
+            if r is None:
+                r = j
+            else:
+                r['objects'].extend(j['objects'])
+            if not 'more' in j:
+                break
+            if j['more'] == 0:
+                break
+            offset = j['objects'][-1]['oid']
+        if 'more' in r:
+            del r['more']
+        return r
+
+    def get_pg_stats(self):
+        """
+        Dump the cluster and get pg stats
+        """
+        out = self.raw_cluster_cmd('pg', 'dump', '--format=json')
+        j = json.loads('\n'.join(out.split('\n')[1:]))
+        try:
+            return j['pg_map']['pg_stats']
+        except KeyError:
+            return j['pg_stats']
+
+    def get_osd_df(self, osdid):
+        """
+        Get the osd df stats
+        """
+        out = self.raw_cluster_cmd('osd', 'df', 'name', 'osd.{}'.format(osdid),
+                                   '--format=json')
+        j = json.loads('\n'.join(out.split('\n')[1:]))
+        return j['nodes'][0]
+
+    def get_pool_df(self, name):
+        """
+        Get the pool df stats
+        """
+        out = self.raw_cluster_cmd('df', 'detail', '--format=json')
+        j = json.loads('\n'.join(out.split('\n')[1:]))
+        return next((p['stats'] for p in j['pools'] if p['name'] == name),
+                    None)
+
+    def get_pgids_to_force(self, backfill):
+        """
+        Return the randomized list of PGs that can have their recovery/backfill forced
+        """
+        j = self.get_pg_stats();
+        pgids = []
+        if backfill:
+            wanted = ['degraded', 'backfilling', 'backfill_wait']
+        else:
+            wanted = ['recovering', 'degraded', 'recovery_wait']
+        for pg in j:
+            status = pg['state'].split('+')
+            for t in wanted:
+                if random.random() > 0.5 and not ('forced_backfill' in status or 'forced_recovery' in status) and t in status:
+                    pgids.append(pg['pgid'])
+                    break
+        return pgids
+
+    def get_pgids_to_cancel_force(self, backfill):
+       """
+       Return the randomized list of PGs whose recovery/backfill priority is forced
+       """
+       j = self.get_pg_stats();
+       pgids = []
+       if backfill:
+           wanted = 'forced_backfill'
+       else:
+           wanted = 'forced_recovery'
+       for pg in j:
+           status = pg['state'].split('+')
+           if wanted in status and random.random() > 0.5:
+               pgids.append(pg['pgid'])
+       return pgids
+
+    def compile_pg_status(self):
+        """
+        Return a histogram of pg state values
+        """
+        ret = {}
+        j = self.get_pg_stats()
+        for pg in j:
+            for status in pg['state'].split('+'):
+                if status not in ret:
+                    ret[status] = 0
+                ret[status] += 1
+        return ret
+
+    @wait_for_pg_stats # type: ignore
+    def with_pg_state(self, pool, pgnum, check):
+        pgstr = self.get_pgid(pool, pgnum)
+        stats = self.get_single_pg_stats(pgstr)
+        assert(check(stats['state']))
+
+    @wait_for_pg_stats # type: ignore
+    def with_pg(self, pool, pgnum, check):
+        pgstr = self.get_pgid(pool, pgnum)
+        stats = self.get_single_pg_stats(pgstr)
+        return check(stats)
+
+    def get_last_scrub_stamp(self, pool, pgnum):
+        """
+        Get the timestamp of the last scrub.
+        """
+        stats = self.get_single_pg_stats(self.get_pgid(pool, pgnum))
+        return stats["last_scrub_stamp"]
+
+    def do_pg_scrub(self, pool, pgnum, stype):
+        """
+        Scrub pg and wait for scrubbing to finish
+        """
+        init = self.get_last_scrub_stamp(pool, pgnum)
+        RESEND_TIMEOUT = 120    # Must be a multiple of SLEEP_TIME
+        FATAL_TIMEOUT = RESEND_TIMEOUT * 3
+        SLEEP_TIME = 10
+        timer = 0
+        while init == self.get_last_scrub_stamp(pool, pgnum):
+            assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
+            self.log("waiting for scrub type %s" % (stype,))
+            if (timer % RESEND_TIMEOUT) == 0:
+                self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
+                # The first time in this loop is the actual request
+                if timer != 0 and stype == "repair":
+                    self.log("WARNING: Resubmitted a non-idempotent repair")
+            time.sleep(SLEEP_TIME)
+            timer += SLEEP_TIME
+
+    def wait_snap_trimming_complete(self, pool):
+        """
+        Wait for snap trimming on pool to end
+        """
+        POLL_PERIOD = 10
+        FATAL_TIMEOUT = 600
+        start = time.time()
+        poolnum = self.get_pool_num(pool)
+        poolnumstr = "%s." % (poolnum,)
+        while (True):
+            now = time.time()
+            if (now - start) > FATAL_TIMEOUT:
+                assert (now - start) < FATAL_TIMEOUT, \
+                    'failed to complete snap trimming before timeout'
+            all_stats = self.get_pg_stats()
+            trimming = False
+            for pg in all_stats:
+                if (poolnumstr in pg['pgid']) and ('snaptrim' in pg['state']):
+                    self.log("pg {pg} in trimming, state: {state}".format(
+                        pg=pg['pgid'],
+                        state=pg['state']))
+                    trimming = True
+            if not trimming:
+                break
+            self.log("{pool} still trimming, waiting".format(pool=pool))
+            time.sleep(POLL_PERIOD)
+
+    def get_single_pg_stats(self, pgid):
+        """
+        Return pg for the pgid specified.
+        """
+        all_stats = self.get_pg_stats()
+
+        for pg in all_stats:
+            if pg['pgid'] == pgid:
+                return pg
+
+        return None
+
+    def get_object_pg_with_shard(self, pool, name, osdid):
+        """
+        """
+        pool_dump = self.get_pool_dump(pool)
+        object_map = self.get_object_map(pool, name)
+        if pool_dump["type"] == PoolType.ERASURE_CODED:
+            shard = object_map['acting'].index(osdid)
+            return "{pgid}s{shard}".format(pgid=object_map['pgid'],
+                                           shard=shard)
+        else:
+            return object_map['pgid']
+
+    def get_object_primary(self, pool, name):
+        """
+        """
+        object_map = self.get_object_map(pool, name)
+        return object_map['acting_primary']
+
+    def get_object_map(self, pool, name):
+        """
+        osd map --format=json converted to a python object
+        :returns: the python object
+        """
+        out = self.raw_cluster_cmd('--format=json', 'osd', 'map', pool, name)
+        return json.loads('\n'.join(out.split('\n')[1:]))
+
+    def get_osd_dump_json(self):
+        """
+        osd dump --format=json converted to a python object
+        :returns: the python object
+        """
+        out = self.raw_cluster_cmd('osd', 'dump', '--format=json')
+        return json.loads('\n'.join(out.split('\n')[1:]))
+
+    def get_osd_dump(self):
+        """
+        Dump osds
+        :returns: all osds
+        """
+        return self.get_osd_dump_json()['osds']
+
+    def get_osd_metadata(self):
+        """
+        osd metadata --format=json converted to a python object
+        :returns: the python object containing osd metadata information
+        """
+        out = self.raw_cluster_cmd('osd', 'metadata', '--format=json')
+        return json.loads('\n'.join(out.split('\n')[1:]))
+
+    def get_mgr_dump(self):
+        out = self.raw_cluster_cmd('mgr', 'dump', '--format=json')
+        return json.loads(out)
+
+    def get_stuck_pgs(self, type_, threshold):
+        """
+        :returns: stuck pg information from the cluster
+        """
+        out = self.raw_cluster_cmd('pg', 'dump_stuck', type_, str(threshold),
+                                   '--format=json')
+        return json.loads(out).get('stuck_pg_stats',[])
+
+    def get_num_unfound_objects(self):
+        """
+        Check cluster status to get the number of unfound objects
+        """
+        status = self.raw_cluster_status()
+        self.log(status)
+        return status['pgmap'].get('unfound_objects', 0)
+
+    def get_num_creating(self):
+        """
+        Find the number of pgs in creating mode.
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if 'creating' in pg['state']:
+                num += 1
+        return num
+
+    def get_num_active_clean(self):
+        """
+        Find the number of active and clean pgs.
+        """
+        pgs = self.get_pg_stats()
+        return self._get_num_active_clean(pgs)
+
+    def _get_num_active_clean(self, pgs):
+        num = 0
+        for pg in pgs:
+            if (pg['state'].count('active') and
+                    pg['state'].count('clean') and
+                    not pg['state'].count('stale')):
+                num += 1
+        return num
+
+    def get_num_active_recovered(self):
+        """
+        Find the number of active and recovered pgs.
+        """
+        pgs = self.get_pg_stats()
+        return self._get_num_active_recovered(pgs)
+
+    def _get_num_active_recovered(self, pgs):
+        num = 0
+        for pg in pgs:
+            if (pg['state'].count('active') and
+                    not pg['state'].count('recover') and
+                    not pg['state'].count('backfilling') and
+                    not pg['state'].count('stale')):
+                num += 1
+        return num
+
+    def get_is_making_recovery_progress(self):
+        """
+        Return whether there is recovery progress discernable in the
+        raw cluster status
+        """
+        status = self.raw_cluster_status()
+        kps = status['pgmap'].get('recovering_keys_per_sec', 0)
+        bps = status['pgmap'].get('recovering_bytes_per_sec', 0)
+        ops = status['pgmap'].get('recovering_objects_per_sec', 0)
+        return kps > 0 or bps > 0 or ops > 0
+
+    def get_num_active(self):
+        """
+        Find the number of active pgs.
+        """
+        pgs = self.get_pg_stats()
+        return self._get_num_active(pgs)
+
+    def _get_num_active(self, pgs):
+        num = 0
+        for pg in pgs:
+            if pg['state'].count('active') and not pg['state'].count('stale'):
+                num += 1
+        return num
+
+    def get_num_down(self):
+        """
+        Find the number of pgs that are down.
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if ((pg['state'].count('down') and not
+                    pg['state'].count('stale')) or
+                (pg['state'].count('incomplete') and not
+                    pg['state'].count('stale'))):
+                num += 1
+        return num
+
+    def get_num_active_down(self):
+        """
+        Find the number of pgs that are either active or down.
+        """
+        pgs = self.get_pg_stats()
+        return self._get_num_active_down(pgs)
+
+    def _get_num_active_down(self, pgs):
+        num = 0
+        for pg in pgs:
+            if ((pg['state'].count('active') and not
+                    pg['state'].count('stale')) or
+                (pg['state'].count('down') and not
+                    pg['state'].count('stale')) or
+                (pg['state'].count('incomplete') and not
+                    pg['state'].count('stale'))):
+                num += 1
+        return num
+
+    def get_num_peered(self):
+        """
+        Find the number of PGs that are peered
+        """
+        pgs = self.get_pg_stats()
+        return self._get_num_peered(pgs)
+
+    def _get_num_peered(self, pgs):
+        num = 0
+        for pg in pgs:
+            if pg['state'].count('peered') and not pg['state'].count('stale'):
+                 num += 1
+        return num
+
+    def is_clean(self):
+        """
+        True if all pgs are clean
+        """
+        pgs = self.get_pg_stats()
+        if self._get_num_active_clean(pgs) == len(pgs):
+            return True
+        else:
+            self.dump_pgs_not_active_clean()
+            return False
+
+    def is_recovered(self):
+        """
+        True if all pgs have recovered
+        """
+        pgs = self.get_pg_stats()
+        return self._get_num_active_recovered(pgs) == len(pgs)
+
+    def is_active_or_down(self):
+        """
+        True if all pgs are active or down
+        """
+        pgs = self.get_pg_stats()
+        return self._get_num_active_down(pgs) == len(pgs)
+
+    def dump_pgs_not_active_clean(self):
+        """
+        Dumps all pgs that are not active+clean
+        """
+        pgs = self.get_pg_stats()
+        for pg in pgs:
+           if pg['state'] != 'active+clean':
+             self.log('PG %s is not active+clean' % pg['pgid'])
+             self.log(pg)
+
+    def dump_pgs_not_active_down(self):
+        """
+        Dumps all pgs that are not active or down
+        """
+        pgs = self.get_pg_stats()
+        for pg in pgs:
+           if 'active' not in pg['state'] and 'down' not in pg['state']:
+             self.log('PG %s is not active or down' % pg['pgid'])
+             self.log(pg)
+
+    def dump_pgs_not_active(self):
+        """
+        Dumps all pgs that are not active
+        """
+        pgs = self.get_pg_stats()
+        for pg in pgs:
+           if 'active' not in pg['state']:
+             self.log('PG %s is not active' % pg['pgid'])
+             self.log(pg)
+
+    def dump_pgs_not_active_peered(self, pgs):
+        for pg in pgs:
+            if (not pg['state'].count('active')) and (not pg['state'].count('peered')):
+                self.log('PG %s is not active or peered' % pg['pgid'])
+                self.log(pg)
+
+    def wait_for_clean(self, timeout=1200):
+        """
+        Returns true when all pgs are clean.
+        """
+        self.log("waiting for clean")
+        start = time.time()
+        num_active_clean = self.get_num_active_clean()
+        while not self.is_clean():
+            if timeout is not None:
+                if self.get_is_making_recovery_progress():
+                    self.log("making progress, resetting timeout")
+                    start = time.time()
+                else:
+                    self.log("no progress seen, keeping timeout for now")
+                    if time.time() - start >= timeout:
+                        self.log('dumping pgs not clean')
+                        self.dump_pgs_not_active_clean()
+                        assert time.time() - start < timeout, \
+                            'wait_for_clean: failed before timeout expired'
+            cur_active_clean = self.get_num_active_clean()
+            if cur_active_clean != num_active_clean:
+                start = time.time()
+                num_active_clean = cur_active_clean
+            time.sleep(3)
+        self.log("clean!")
+
+    def are_all_osds_up(self):
+        """
+        Returns true if all osds are up.
+        """
+        x = self.get_osd_dump()
+        return (len(x) == sum([(y['up'] > 0) for y in x]))
+
+    def wait_for_all_osds_up(self, timeout=None):
+        """
+        When this exits, either the timeout has expired, or all
+        osds are up.
+        """
+        self.log("waiting for all up")
+        start = time.time()
+        while not self.are_all_osds_up():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_all_osds_up'
+            time.sleep(3)
+        self.log("all up!")
+
+    def pool_exists(self, pool):
+        if pool in self.list_pools():
+            return True
+        return False
+
+    def wait_for_pool(self, pool, timeout=300):
+        """
+        Wait for a pool to exist
+        """
+        self.log('waiting for pool %s to exist' % pool)
+        start = time.time()
+        while not self.pool_exists(pool):
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_pool'
+            time.sleep(3)
+
+    def wait_for_pools(self, pools):
+        for pool in pools:
+            self.wait_for_pool(pool)
+
+    def is_mgr_available(self):
+        x = self.get_mgr_dump()
+        return x.get('available', False)
+
+    def wait_for_mgr_available(self, timeout=None):
+        self.log("waiting for mgr available")
+        start = time.time()
+        while not self.is_mgr_available():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_mgr_available'
+            time.sleep(3)
+        self.log("mgr available!")
+
+    def wait_for_recovery(self, timeout=None):
+        """
+        Check peering. When this exists, we have recovered.
+        """
+        self.log("waiting for recovery to complete")
+        start = time.time()
+        num_active_recovered = self.get_num_active_recovered()
+        while not self.is_recovered():
+            now = time.time()
+            if timeout is not None:
+                if self.get_is_making_recovery_progress():
+                    self.log("making progress, resetting timeout")
+                    start = time.time()
+                else:
+                    self.log("no progress seen, keeping timeout for now")
+                    if now - start >= timeout:
+                        if self.is_recovered():
+                            break
+                        self.log('dumping pgs not recovered yet')
+                        self.dump_pgs_not_active_clean()
+                        assert now - start < timeout, \
+                            'wait_for_recovery: failed before timeout expired'
+            cur_active_recovered = self.get_num_active_recovered()
+            if cur_active_recovered != num_active_recovered:
+                start = time.time()
+                num_active_recovered = cur_active_recovered
+            time.sleep(3)
+        self.log("recovered!")
+
+    def wait_for_active(self, timeout=None):
+        """
+        Check peering. When this exists, we are definitely active
+        """
+        self.log("waiting for peering to complete")
+        start = time.time()
+        num_active = self.get_num_active()
+        while not self.is_active():
+            if timeout is not None:
+                if time.time() - start >= timeout:
+                    self.log('dumping pgs not active')
+                    self.dump_pgs_not_active()
+                    assert time.time() - start < timeout, \
+                        'wait_for_active: failed before timeout expired'
+            cur_active = self.get_num_active()
+            if cur_active != num_active:
+                start = time.time()
+                num_active = cur_active
+            time.sleep(3)
+        self.log("active!")
+
+    def wait_for_active_or_down(self, timeout=None):
+        """
+        Check peering. When this exists, we are definitely either
+        active or down
+        """
+        self.log("waiting for peering to complete or become blocked")
+        start = time.time()
+        num_active_down = self.get_num_active_down()
+        while not self.is_active_or_down():
+            if timeout is not None:
+                if time.time() - start >= timeout:
+                    self.log('dumping pgs not active or down')
+                    self.dump_pgs_not_active_down()
+                    assert time.time() - start < timeout, \
+                        'wait_for_active_or_down: failed before timeout expired'
+            cur_active_down = self.get_num_active_down()
+            if cur_active_down != num_active_down:
+                start = time.time()
+                num_active_down = cur_active_down
+            time.sleep(3)
+        self.log("active or down!")
+
+    def osd_is_up(self, osd):
+        """
+        Wrapper for osd check
+        """
+        osds = self.get_osd_dump()
+        return osds[osd]['up'] > 0
+
+    def wait_till_osd_is_up(self, osd, timeout=None):
+        """
+        Loop waiting for osd.
+        """
+        self.log('waiting for osd.%d to be up' % osd)
+        start = time.time()
+        while not self.osd_is_up(osd):
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'osd.%d failed to come up before timeout expired' % osd
+            time.sleep(3)
+        self.log('osd.%d is up' % osd)
+
+    def is_active(self):
+        """
+        Wrapper to check if all pgs are active
+        """
+        return self.get_num_active() == self.get_num_pgs()
+
+    def all_active_or_peered(self):
+        """
+        Wrapper to check if all PGs are active or peered
+        """
+        pgs = self.get_pg_stats()
+        if self._get_num_active(pgs) + self._get_num_peered(pgs) == len(pgs):
+            return True
+        else:
+            self.dump_pgs_not_active_peered(pgs)
+            return False
+
+    def wait_till_active(self, timeout=None):
+        """
+        Wait until all pgs are active.
+        """
+        self.log("waiting till active")
+        start = time.time()
+        while not self.is_active():
+            if timeout is not None:
+                if time.time() - start >= timeout:
+                    self.log('dumping pgs not active')
+                    self.dump_pgs_not_active()
+                    assert time.time() - start < timeout, \
+                        'wait_till_active: failed before timeout expired'
+            time.sleep(3)
+        self.log("active!")
+
+    def wait_till_pg_convergence(self, timeout=None):
+        start = time.time()
+        old_stats = None
+        active_osds = [osd['osd'] for osd in self.get_osd_dump()
+                       if osd['in'] and osd['up']]
+        while True:
+            # strictly speaking, no need to wait for mon. but due to the
+            # "ms inject socket failures" setting, the osdmap could be delayed,
+            # so mgr is likely to ignore the pg-stat messages with pgs serving
+            # newly created pools which is not yet known by mgr. so, to make sure
+            # the mgr is updated with the latest pg-stats, waiting for mon/mgr is
+            # necessary.
+            self.flush_pg_stats(active_osds)
+            new_stats = dict((stat['pgid'], stat['state'])
+                             for stat in self.get_pg_stats())
+            if old_stats == new_stats:
+                return old_stats
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'failed to reach convergence before %d secs' % timeout
+            old_stats = new_stats
+            # longer than mgr_stats_period
+            time.sleep(5 + 1)
+
+    def mark_out_osd(self, osd):
+        """
+        Wrapper to mark osd out.
+        """
+        self.raw_cluster_cmd('osd', 'out', str(osd))
+
+    def kill_osd(self, osd):
+        """
+        Kill osds by either power cycling (if indicated by the config)
+        or by stopping.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('osd', osd)
+            self.log('kill_osd on osd.{o} '
+                     'doing powercycle of {s}'.format(o=osd, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_off()
+        elif self.config.get('bdev_inject_crash') and self.config.get('bdev_inject_crash_probability'):
+            if random.uniform(0, 1) < self.config.get('bdev_inject_crash_probability', .5):
+                self.inject_args(
+                    'osd', osd,
+                    'bdev-inject-crash', self.config.get('bdev_inject_crash'))
+                try:
+                    self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait()
+                except:
+                    pass
+                else:
+                    raise RuntimeError('osd.%s did not fail' % osd)
+            else:
+                self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+        else:
+            self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+
+    @staticmethod
+    def _assert_ipmi(remote):
+        assert remote.console.has_ipmi_credentials, (
+            "powercycling requested but RemoteConsole is not "
+            "initialized.  Check ipmi config.")
+
+    def blackhole_kill_osd(self, osd):
+        """
+        Stop osd if nothing else works.
+        """
+        self.inject_args('osd', osd,
+                         'objectstore-blackhole', True)
+        time.sleep(2)
+        self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+
+    def revive_osd(self, osd, timeout=360, skip_admin_check=False):
+        """
+        Revive osds by either power cycling (if indicated by the config)
+        or by restarting.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('osd', osd)
+            self.log('kill_osd on osd.{o} doing powercycle of {s}'.
+                     format(o=osd, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            if not remote.console.check_status(300):
+                raise Exception('Failed to revive osd.{o} via ipmi'.
+                                format(o=osd))
+            teuthology.reconnect(self.ctx, 60, [remote])
+            mount_osd_data(self.ctx, remote, self.cluster, str(osd))
+            self.make_admin_daemon_dir(remote)
+            self.ctx.daemons.get_daemon('osd', osd, self.cluster).reset()
+        self.ctx.daemons.get_daemon('osd', osd, self.cluster).restart()
+
+        if not skip_admin_check:
+            # wait for dump_ops_in_flight; this command doesn't appear
+            # until after the signal handler is installed and it is safe
+            # to stop the osd again without making valgrind leak checks
+            # unhappy.  see #5924.
+            self.wait_run_admin_socket('osd', osd,
+                                       args=['dump_ops_in_flight'],
+                                       timeout=timeout, stdout=DEVNULL)
+
+    def mark_down_osd(self, osd):
+        """
+        Cluster command wrapper
+        """
+        self.raw_cluster_cmd('osd', 'down', str(osd))
+
+    def mark_in_osd(self, osd):
+        """
+        Cluster command wrapper
+        """
+        self.raw_cluster_cmd('osd', 'in', str(osd))
+
+    def signal_osd(self, osd, sig, silent=False):
+        """
+        Wrapper to local get_daemon call which sends the given
+        signal to the given osd.
+        """
+        self.ctx.daemons.get_daemon('osd', osd,
+                                    self.cluster).signal(sig, silent=silent)
+
+    ## monitors
+    def signal_mon(self, mon, sig, silent=False):
+        """
+        Wrapper to local get_daemon call
+        """
+        self.ctx.daemons.get_daemon('mon', mon,
+                                    self.cluster).signal(sig, silent=silent)
+
+    def kill_mon(self, mon):
+        """
+        Kill the monitor by either power cycling (if the config says so),
+        or by doing a stop.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('mon', mon)
+            self.log('kill_mon on mon.{m} doing powercycle of {s}'.
+                     format(m=mon, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_off()
+        else:
+            self.ctx.daemons.get_daemon('mon', mon, self.cluster).stop()
+
+    def revive_mon(self, mon):
+        """
+        Restart by either power cycling (if the config says so),
+        or by doing a normal restart.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('mon', mon)
+            self.log('revive_mon on mon.{m} doing powercycle of {s}'.
+                     format(m=mon, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            self.make_admin_daemon_dir(remote)
+        self.ctx.daemons.get_daemon('mon', mon, self.cluster).restart()
+
+    def revive_mgr(self, mgr):
+        """
+        Restart by either power cycling (if the config says so),
+        or by doing a normal restart.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('mgr', mgr)
+            self.log('revive_mgr on mgr.{m} doing powercycle of {s}'.
+                     format(m=mgr, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            self.make_admin_daemon_dir(remote)
+        self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart()
+
+    def get_mon_status(self, mon):
+        """
+        Extract all the monitor status information from the cluster
+        """
+        out = self.raw_cluster_cmd('tell', 'mon.%s' % mon, 'mon_status')
+        return json.loads(out)
+
+    def get_mon_quorum(self):
+        """
+        Extract monitor quorum information from the cluster
+        """
+        out = self.raw_cluster_cmd('quorum_status')
+        j = json.loads(out)
+        return j['quorum']
+
+    def wait_for_mon_quorum_size(self, size, timeout=300):
+        """
+        Loop until quorum size is reached.
+        """
+        self.log('waiting for quorum size %d' % size)
+        sleep = 3
+        with safe_while(sleep=sleep,
+                        tries=timeout // sleep,
+                        action=f'wait for quorum size {size}') as proceed:
+            while proceed():
+                try:
+                    if len(self.get_mon_quorum()) == size:
+                        break
+                except CommandFailedError as e:
+                    # could fail instea4d of blocked if the rotating key of the
+                    # connected monitor is not updated yet after they form the
+                    # quorum
+                    if e.exitstatus == errno.EACCES:
+                        pass
+                    else:
+                        raise
+        self.log("quorum is size %d" % size)
+
+    def get_mon_health(self, debug=False):
+        """
+        Extract all the monitor health information.
+        """
+        out = self.raw_cluster_cmd('health', '--format=json')
+        if debug:
+            self.log('health:\n{h}'.format(h=out))
+        return json.loads(out)
+
+    def wait_until_healthy(self, timeout=None):
+        self.log("wait_until_healthy")
+        start = time.time()
+        while self.get_mon_health()['status'] != 'HEALTH_OK':
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_until_healthy'
+            time.sleep(3)
+        self.log("wait_until_healthy done")
+
+    def get_filepath(self):
+        """
+        Return path to osd data with {id} needing to be replaced
+        """
+        return '/var/lib/ceph/osd/' + self.cluster + '-{id}'
+
+    def make_admin_daemon_dir(self, remote):
+        """
+        Create /var/run/ceph directory on remote site.
+
+        :param ctx: Context
+        :param remote: Remote site
+        """
+        remote.run(args=['sudo',
+                         'install', '-d', '-m0777', '--', '/var/run/ceph', ], )
+
+    def get_service_task_status(self, service, status_key):
+        """
+        Return daemon task status for a given ceph service.
+
+        :param service: ceph service (mds, osd, etc...)
+        :param status_key: matching task status key
+        """
+        task_status = {}
+        status = self.raw_cluster_status()
+        try:
+            for k,v in status['servicemap']['services'][service]['daemons'].items():
+                ts = dict(v).get('task_status', None)
+                if ts:
+                    task_status[k] = ts[status_key]
+        except KeyError: # catches missing service and status key
+            return {}
+        self.log(task_status)
+        return task_status
+
+def utility_task(name):
+    """
+    Generate ceph_manager subtask corresponding to ceph_manager
+    method name
+    """
+    def task(ctx, config):
+        if config is None:
+            config = {}
+        args = config.get('args', [])
+        kwargs = config.get('kwargs', {})
+        cluster = config.get('cluster', 'ceph')
+        fn = getattr(ctx.managers[cluster], name)
+        fn(*args, **kwargs)
+    return task
+
+revive_osd = utility_task("revive_osd")
+revive_mon = utility_task("revive_mon")
+kill_osd = utility_task("kill_osd")
+kill_mon = utility_task("kill_mon")
+create_pool = utility_task("create_pool")
+remove_pool = utility_task("remove_pool")
+wait_for_clean = utility_task("wait_for_clean")
+flush_all_pg_stats = utility_task("flush_all_pg_stats")
+set_pool_property = utility_task("set_pool_property")
+do_pg_scrub = utility_task("do_pg_scrub")
+wait_for_pool = utility_task("wait_for_pool")
+wait_for_pools = utility_task("wait_for_pools")
diff --git a/qa/tasks/ceph_objectstore_tool.py b/qa/tasks/ceph_objectstore_tool.py
new file mode 100644
index 000000000..9c29d80b2
--- /dev/null
+++ b/qa/tasks/ceph_objectstore_tool.py
@@ -0,0 +1,662 @@
+"""
+ceph_objectstore_tool - Simple test of ceph-objectstore-tool utility
+"""
+from io import BytesIO
+
+import contextlib
+import json
+import logging
+import os
+import sys
+import tempfile
+import time
+from tasks import ceph_manager
+from tasks.util.rados import (rados, create_replicated_pool, create_ec_pool)
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+from teuthology.exceptions import CommandFailedError
+
+# from util.rados import (rados, create_ec_pool,
+#                               create_replicated_pool,
+#                               create_cache_pool)
+
+log = logging.getLogger(__name__)
+
+# Should get cluster name "ceph" from somewhere
+# and normal path from osd_data and osd_journal in conf
+FSPATH = "/var/lib/ceph/osd/ceph-{id}"
+JPATH = "/var/lib/ceph/osd/ceph-{id}/journal"
+
+
+def cod_setup_local_data(log, ctx, NUM_OBJECTS, DATADIR,
+                         BASE_NAME, DATALINECOUNT):
+    objects = range(1, NUM_OBJECTS + 1)
+    for i in objects:
+        NAME = BASE_NAME + "{num}".format(num=i)
+        LOCALNAME = os.path.join(DATADIR, NAME)
+
+        dataline = range(DATALINECOUNT)
+        fd = open(LOCALNAME, "w")
+        data = "This is the data for " + NAME + "\n"
+        for _ in dataline:
+            fd.write(data)
+        fd.close()
+
+
+def cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR,
+                          BASE_NAME, DATALINECOUNT):
+
+    objects = range(1, NUM_OBJECTS + 1)
+    for i in objects:
+        NAME = BASE_NAME + "{num}".format(num=i)
+        DDNAME = os.path.join(DATADIR, NAME)
+
+        remote.run(args=['rm', '-f', DDNAME])
+
+        dataline = range(DATALINECOUNT)
+        data = "This is the data for " + NAME + "\n"
+        DATA = ""
+        for _ in dataline:
+            DATA += data
+        remote.write_file(DDNAME, DATA)
+
+
+def cod_setup(log, ctx, remote, NUM_OBJECTS, DATADIR,
+              BASE_NAME, DATALINECOUNT, POOL, db, ec):
+    ERRORS = 0
+    log.info("Creating {objs} objects in pool".format(objs=NUM_OBJECTS))
+
+    objects = range(1, NUM_OBJECTS + 1)
+    for i in objects:
+        NAME = BASE_NAME + "{num}".format(num=i)
+        DDNAME = os.path.join(DATADIR, NAME)
+
+        proc = rados(ctx, remote, ['-p', POOL, 'put', NAME, DDNAME],
+                     wait=False)
+        # proc = remote.run(args=['rados', '-p', POOL, 'put', NAME, DDNAME])
+        ret = proc.wait()
+        if ret != 0:
+            log.critical("Rados put failed with status {ret}".
+                         format(ret=proc.exitstatus))
+            sys.exit(1)
+
+        db[NAME] = {}
+
+        keys = range(i)
+        db[NAME]["xattr"] = {}
+        for k in keys:
+            if k == 0:
+                continue
+            mykey = "key{i}-{k}".format(i=i, k=k)
+            myval = "val{i}-{k}".format(i=i, k=k)
+            proc = remote.run(args=['rados', '-p', POOL, 'setxattr',
+                                    NAME, mykey, myval])
+            ret = proc.wait()
+            if ret != 0:
+                log.error("setxattr failed with {ret}".format(ret=ret))
+                ERRORS += 1
+            db[NAME]["xattr"][mykey] = myval
+
+        # Erasure coded pools don't support omap
+        if ec:
+            continue
+
+        # Create omap header in all objects but REPobject1
+        if i != 1:
+            myhdr = "hdr{i}".format(i=i)
+            proc = remote.run(args=['rados', '-p', POOL, 'setomapheader',
+                                    NAME, myhdr])
+            ret = proc.wait()
+            if ret != 0:
+                log.critical("setomapheader failed with {ret}".format(ret=ret))
+                ERRORS += 1
+            db[NAME]["omapheader"] = myhdr
+
+        db[NAME]["omap"] = {}
+        for k in keys:
+            if k == 0:
+                continue
+            mykey = "okey{i}-{k}".format(i=i, k=k)
+            myval = "oval{i}-{k}".format(i=i, k=k)
+            proc = remote.run(args=['rados', '-p', POOL, 'setomapval',
+                                    NAME, mykey, myval])
+            ret = proc.wait()
+            if ret != 0:
+                log.critical("setomapval failed with {ret}".format(ret=ret))
+            db[NAME]["omap"][mykey] = myval
+
+    return ERRORS
+
+
+def get_lines(filename):
+    tmpfd = open(filename, "r")
+    line = True
+    lines = []
+    while line:
+        line = tmpfd.readline().rstrip('\n')
+        if line:
+            lines += [line]
+    tmpfd.close()
+    os.unlink(filename)
+    return lines
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run ceph_objectstore_tool test
+
+    The config should be as follows::
+
+        ceph_objectstore_tool:
+          objects: 20 # <number of objects>
+          pgnum: 12
+    """
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'ceph_objectstore_tool task only accepts a dict for configuration'
+
+    log.info('Beginning ceph_objectstore_tool...')
+
+    log.debug(config)
+    log.debug(ctx)
+    clients = ctx.cluster.only(teuthology.is_type('client'))
+    assert len(clients.remotes) > 0, 'Must specify at least 1 client'
+    (cli_remote, _) = clients.remotes.popitem()
+    log.debug(cli_remote)
+
+    # clients = dict(teuthology.get_clients(ctx=ctx, roles=config.keys()))
+    # client = clients.popitem()
+    # log.info(client)
+    osds = ctx.cluster.only(teuthology.is_type('osd'))
+    log.info("OSDS")
+    log.info(osds)
+    log.info(osds.remotes)
+
+    manager = ctx.managers['ceph']
+    while (len(manager.get_osd_status()['up']) !=
+           len(manager.get_osd_status()['raw'])):
+        time.sleep(10)
+    while (len(manager.get_osd_status()['in']) !=
+           len(manager.get_osd_status()['up'])):
+        time.sleep(10)
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+
+    PGNUM = config.get('pgnum', 12)
+    log.info("pgnum: {num}".format(num=PGNUM))
+
+    ERRORS = 0
+
+    REP_POOL = "rep_pool"
+    REP_NAME = "REPobject"
+    create_replicated_pool(cli_remote, REP_POOL, PGNUM)
+    ERRORS += test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME)
+
+    EC_POOL = "ec_pool"
+    EC_NAME = "ECobject"
+    create_ec_pool(cli_remote, EC_POOL, 'default', PGNUM)
+    ERRORS += test_objectstore(ctx, config, cli_remote,
+                               EC_POOL, EC_NAME, ec=True)
+
+    if ERRORS == 0:
+        log.info("TEST PASSED")
+    else:
+        log.error("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS))
+
+    assert ERRORS == 0
+
+    try:
+        yield
+    finally:
+        log.info('Ending ceph_objectstore_tool')
+
+
+def test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME, ec=False):
+    manager = ctx.managers['ceph']
+
+    osds = ctx.cluster.only(teuthology.is_type('osd'))
+
+    TEUTHDIR = teuthology.get_testdir(ctx)
+    DATADIR = os.path.join(TEUTHDIR, "ceph.data")
+    DATALINECOUNT = 10000
+    ERRORS = 0
+    NUM_OBJECTS = config.get('objects', 10)
+    log.info("objects: {num}".format(num=NUM_OBJECTS))
+
+    pool_dump = manager.get_pool_dump(REP_POOL)
+    REPID = pool_dump['pool']
+
+    log.debug("repid={num}".format(num=REPID))
+
+    db = {}
+
+    LOCALDIR = tempfile.mkdtemp("cod")
+
+    cod_setup_local_data(log, ctx, NUM_OBJECTS, LOCALDIR,
+                         REP_NAME, DATALINECOUNT)
+    allremote = []
+    allremote.append(cli_remote)
+    allremote += list(osds.remotes.keys())
+    allremote = list(set(allremote))
+    for remote in allremote:
+        cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR,
+                              REP_NAME, DATALINECOUNT)
+
+    ERRORS += cod_setup(log, ctx, cli_remote, NUM_OBJECTS, DATADIR,
+                        REP_NAME, DATALINECOUNT, REP_POOL, db, ec)
+
+    pgs = {}
+    for stats in manager.get_pg_stats():
+        if stats["pgid"].find(str(REPID) + ".") != 0:
+            continue
+        if pool_dump["type"] == ceph_manager.PoolType.REPLICATED:
+            for osd in stats["acting"]:
+                pgs.setdefault(osd, []).append(stats["pgid"])
+        elif pool_dump["type"] == ceph_manager.PoolType.ERASURE_CODED:
+            shard = 0
+            for osd in stats["acting"]:
+                pgs.setdefault(osd, []).append("{pgid}s{shard}".
+                                               format(pgid=stats["pgid"],
+                                                      shard=shard))
+                shard += 1
+        else:
+            raise Exception("{pool} has an unexpected type {type}".
+                            format(pool=REP_POOL, type=pool_dump["type"]))
+
+    log.info(pgs)
+    log.info(db)
+
+    for osd in manager.get_osd_status()['up']:
+        manager.kill_osd(osd)
+    time.sleep(5)
+
+    pgswithobjects = set()
+    objsinpg = {}
+
+    # Test --op list and generate json for all objects
+    log.info("Test --op list by generating json for all objects")
+    prefix = ("sudo ceph-objectstore-tool "
+              "--data-path {fpath} "
+              "--journal-path {jpath} ").format(fpath=FSPATH, jpath=JPATH)
+    for remote in osds.remotes.keys():
+        log.debug(remote)
+        log.debug(osds.remotes[remote])
+        for role in osds.remotes[remote]:
+            if not role.startswith("osd."):
+                continue
+            osdid = int(role.split('.')[1])
+            log.info("process osd.{id} on {remote}".
+                     format(id=osdid, remote=remote))
+            cmd = (prefix + "--op list").format(id=osdid)
+            try:
+                lines = remote.sh(cmd, check_status=False).splitlines()
+                for pgline in lines:
+                    if not pgline:
+                        continue
+                    (pg, obj) = json.loads(pgline)
+                    name = obj['oid']
+                    if name in db:
+                        pgswithobjects.add(pg)
+                        objsinpg.setdefault(pg, []).append(name)
+                        db[name].setdefault("pg2json",
+                                            {})[pg] = json.dumps(obj)
+            except CommandFailedError as e:
+                log.error("Bad exit status {ret} from --op list request".
+                          format(ret=e.exitstatus))
+                ERRORS += 1
+
+    log.info(db)
+    log.info(pgswithobjects)
+    log.info(objsinpg)
+
+    if pool_dump["type"] == ceph_manager.PoolType.REPLICATED:
+        # Test get-bytes
+        log.info("Test get-bytes and set-bytes")
+        for basename in db.keys():
+            file = os.path.join(DATADIR, basename)
+            GETNAME = os.path.join(DATADIR, "get")
+            SETNAME = os.path.join(DATADIR, "set")
+
+            for remote in osds.remotes.keys():
+                for role in osds.remotes[remote]:
+                    if not role.startswith("osd."):
+                        continue
+                    osdid = int(role.split('.')[1])
+                    if osdid not in pgs:
+                        continue
+
+                    for pg, JSON in db[basename]["pg2json"].items():
+                        if pg in pgs[osdid]:
+                            cmd = ((prefix + "--pgid {pg}").
+                                   format(id=osdid, pg=pg).split())
+                            cmd.append(run.Raw("'{json}'".format(json=JSON)))
+                            cmd += ("get-bytes {fname}".
+                                    format(fname=GETNAME).split())
+                            proc = remote.run(args=cmd, check_status=False)
+                            if proc.exitstatus != 0:
+                                remote.run(args="rm -f {getfile}".
+                                           format(getfile=GETNAME).split())
+                                log.error("Bad exit status {ret}".
+                                          format(ret=proc.exitstatus))
+                                ERRORS += 1
+                                continue
+                            cmd = ("diff -q {file} {getfile}".
+                                   format(file=file, getfile=GETNAME))
+                            proc = remote.run(args=cmd.split())
+                            if proc.exitstatus != 0:
+                                log.error("Data from get-bytes differ")
+                                # log.debug("Got:")
+                                # cat_file(logging.DEBUG, GETNAME)
+                                # log.debug("Expected:")
+                                # cat_file(logging.DEBUG, file)
+                                ERRORS += 1
+                            remote.run(args="rm -f {getfile}".
+                                       format(getfile=GETNAME).split())
+
+                            data = ("put-bytes going into {file}\n".
+                                    format(file=file))
+                            remote.write_file(SETNAME, data)
+                            cmd = ((prefix + "--pgid {pg}").
+                                   format(id=osdid, pg=pg).split())
+                            cmd.append(run.Raw("'{json}'".format(json=JSON)))
+                            cmd += ("set-bytes {fname}".
+                                    format(fname=SETNAME).split())
+                            proc = remote.run(args=cmd, check_status=False)
+                            proc.wait()
+                            if proc.exitstatus != 0:
+                                log.info("set-bytes failed for object {obj} "
+                                         "in pg {pg} osd.{id} ret={ret}".
+                                         format(obj=basename, pg=pg,
+                                                id=osdid, ret=proc.exitstatus))
+                                ERRORS += 1
+
+                            cmd = ((prefix + "--pgid {pg}").
+                                   format(id=osdid, pg=pg).split())
+                            cmd.append(run.Raw("'{json}'".format(json=JSON)))
+                            cmd += "get-bytes -".split()
+                            try:
+                                output = remote.sh(cmd, wait=True)
+                                if data != output:
+                                    log.error("Data inconsistent after "
+                                              "set-bytes, got:")
+                                    log.error(output)
+                                    ERRORS += 1
+                            except CommandFailedError as e:
+                                log.error("get-bytes after "
+                                          "set-bytes ret={ret}".
+                                          format(ret=e.exitstatus))
+                                ERRORS += 1
+
+                            cmd = ((prefix + "--pgid {pg}").
+                                   format(id=osdid, pg=pg).split())
+                            cmd.append(run.Raw("'{json}'".format(json=JSON)))
+                            cmd += ("set-bytes {fname}".
+                                    format(fname=file).split())
+                            proc = remote.run(args=cmd, check_status=False)
+                            proc.wait()
+                            if proc.exitstatus != 0:
+                                log.info("set-bytes failed for object {obj} "
+                                         "in pg {pg} osd.{id} ret={ret}".
+                                         format(obj=basename, pg=pg,
+                                                id=osdid, ret=proc.exitstatus))
+                                ERRORS += 1
+
+    log.info("Test list-attrs get-attr")
+    for basename in db.keys():
+        file = os.path.join(DATADIR, basename)
+        GETNAME = os.path.join(DATADIR, "get")
+        SETNAME = os.path.join(DATADIR, "set")
+
+        for remote in osds.remotes.keys():
+            for role in osds.remotes[remote]:
+                if not role.startswith("osd."):
+                    continue
+                osdid = int(role.split('.')[1])
+                if osdid not in pgs:
+                    continue
+
+                for pg, JSON in db[basename]["pg2json"].items():
+                    if pg in pgs[osdid]:
+                        cmd = ((prefix + "--pgid {pg}").
+                               format(id=osdid, pg=pg).split())
+                        cmd.append(run.Raw("'{json}'".format(json=JSON)))
+                        cmd += ["list-attrs"]
+                        try:
+                            keys = remote.sh(cmd, wait=True, stderr=BytesIO()).split()
+                        except CommandFailedError as e:
+                            log.error("Bad exit status {ret}".
+                                      format(ret=e.exitstatus))
+                            ERRORS += 1
+                            continue
+                        values = dict(db[basename]["xattr"])
+
+                        for key in keys:
+                            if (key == "_" or
+                                    key == "snapset" or
+                                    key == "hinfo_key"):
+                                continue
+                            key = key.strip("_")
+                            if key not in values:
+                                log.error("The key {key} should be present".
+                                          format(key=key))
+                                ERRORS += 1
+                                continue
+                            exp = values.pop(key)
+                            cmd = ((prefix + "--pgid {pg}").
+                                   format(id=osdid, pg=pg).split())
+                            cmd.append(run.Raw("'{json}'".format(json=JSON)))
+                            cmd += ("get-attr {key}".
+                                    format(key="_" + key).split())
+                            try:
+                                val = remote.sh(cmd, wait=True)
+                            except CommandFailedError as e:
+                                log.error("get-attr failed with {ret}".
+                                          format(ret=e.exitstatus))
+                                ERRORS += 1
+                                continue
+                            if exp != val:
+                                log.error("For key {key} got value {got} "
+                                          "instead of {expected}".
+                                          format(key=key, got=val,
+                                                 expected=exp))
+                                ERRORS += 1
+                        if "hinfo_key" in keys:
+                            cmd_prefix = prefix.format(id=osdid)
+                            cmd = """
+      expected=$({prefix} --pgid {pg} '{json}' get-attr {key} | base64)
+      echo placeholder | {prefix} --pgid {pg} '{json}' set-attr {key} -
+      test $({prefix} --pgid {pg} '{json}' get-attr {key}) = placeholder
+      echo $expected | base64 --decode | \
+         {prefix} --pgid {pg} '{json}' set-attr {key} -
+      test $({prefix} --pgid {pg} '{json}' get-attr {key} | base64) = $expected
+                            """.format(prefix=cmd_prefix, pg=pg, json=JSON,
+                                       key="hinfo_key")
+                            log.debug(cmd)
+                            proc = remote.run(args=['bash', '-e', '-x',
+                                                    '-c', cmd],
+                                              check_status=False,
+                                              stdout=BytesIO(),
+                                              stderr=BytesIO())
+                            proc.wait()
+                            if proc.exitstatus != 0:
+                                log.error("failed with " +
+                                          str(proc.exitstatus))
+                                log.error(" ".join([
+                                    proc.stdout.getvalue().decode(),
+                                    proc.stderr.getvalue().decode(),
+                                    ]))
+                                ERRORS += 1
+
+                        if len(values) != 0:
+                            log.error("Not all keys found, remaining keys:")
+                            log.error(values)
+
+    log.info("Test pg info")
+    for remote in osds.remotes.keys():
+        for role in osds.remotes[remote]:
+            if not role.startswith("osd."):
+                continue
+            osdid = int(role.split('.')[1])
+            if osdid not in pgs:
+                continue
+
+            for pg in pgs[osdid]:
+                cmd = ((prefix + "--op info --pgid {pg}").
+                       format(id=osdid, pg=pg).split())
+                try:
+                    info = remote.sh(cmd, wait=True)
+                except CommandFailedError as e:
+                    log.error("Failure of --op info command with %s",
+                              e.exitstatus)
+                    ERRORS += 1
+                    continue
+                if not str(pg) in info:
+                    log.error("Bad data from info: %s", info)
+                    ERRORS += 1
+
+    log.info("Test pg logging")
+    for remote in osds.remotes.keys():
+        for role in osds.remotes[remote]:
+            if not role.startswith("osd."):
+                continue
+            osdid = int(role.split('.')[1])
+            if osdid not in pgs:
+                continue
+
+            for pg in pgs[osdid]:
+                cmd = ((prefix + "--op log --pgid {pg}").
+                       format(id=osdid, pg=pg).split())
+                try:
+                    output = remote.sh(cmd, wait=True)
+                except CommandFailedError as e:
+                    log.error("Getting log failed for pg {pg} "
+                              "from osd.{id} with {ret}".
+                              format(pg=pg, id=osdid, ret=e.exitstatus))
+                    ERRORS += 1
+                    continue
+                HASOBJ = pg in pgswithobjects
+                MODOBJ = "modify" in output
+                if HASOBJ != MODOBJ:
+                    log.error("Bad log for pg {pg} from osd.{id}".
+                              format(pg=pg, id=osdid))
+                    MSG = (HASOBJ and [""] or ["NOT "])[0]
+                    log.error("Log should {msg}have a modify entry".
+                              format(msg=MSG))
+                    ERRORS += 1
+
+    log.info("Test pg export")
+    EXP_ERRORS = 0
+    for remote in osds.remotes.keys():
+        for role in osds.remotes[remote]:
+            if not role.startswith("osd."):
+                continue
+            osdid = int(role.split('.')[1])
+            if osdid not in pgs:
+                continue
+
+            for pg in pgs[osdid]:
+                fpath = os.path.join(DATADIR, "osd{id}.{pg}".
+                                     format(id=osdid, pg=pg))
+
+                cmd = ((prefix + "--op export --pgid {pg} --file {file}").
+                       format(id=osdid, pg=pg, file=fpath))
+                try:
+                    remote.sh(cmd, wait=True)
+                except CommandFailedError as e:
+                    log.error("Exporting failed for pg {pg} "
+                              "on osd.{id} with {ret}".
+                              format(pg=pg, id=osdid, ret=e.exitstatus))
+                    EXP_ERRORS += 1
+
+    ERRORS += EXP_ERRORS
+
+    log.info("Test pg removal")
+    RM_ERRORS = 0
+    for remote in osds.remotes.keys():
+        for role in osds.remotes[remote]:
+            if not role.startswith("osd."):
+                continue
+            osdid = int(role.split('.')[1])
+            if osdid not in pgs:
+                continue
+
+            for pg in pgs[osdid]:
+                cmd = ((prefix + "--force --op remove --pgid {pg}").
+                       format(pg=pg, id=osdid))
+                try:
+                    remote.sh(cmd, wait=True)
+                except CommandFailedError as e:
+                    log.error("Removing failed for pg {pg} "
+                              "on osd.{id} with {ret}".
+                              format(pg=pg, id=osdid, ret=e.exitstatus))
+                    RM_ERRORS += 1
+
+    ERRORS += RM_ERRORS
+
+    IMP_ERRORS = 0
+    if EXP_ERRORS == 0 and RM_ERRORS == 0:
+        log.info("Test pg import")
+
+        for remote in osds.remotes.keys():
+            for role in osds.remotes[remote]:
+                if not role.startswith("osd."):
+                    continue
+                osdid = int(role.split('.')[1])
+                if osdid not in pgs:
+                    continue
+
+                for pg in pgs[osdid]:
+                    fpath = os.path.join(DATADIR, "osd{id}.{pg}".
+                                         format(id=osdid, pg=pg))
+
+                    cmd = ((prefix + "--op import --file {file}").
+                           format(id=osdid, file=fpath))
+                    try:
+                        remote.sh(cmd, wait=True)
+                    except CommandFailedError as e:
+                        log.error("Import failed from {file} with {ret}".
+                                  format(file=fpath, ret=e.exitstatus))
+                        IMP_ERRORS += 1
+    else:
+        log.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES")
+
+    ERRORS += IMP_ERRORS
+
+    if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
+        log.info("Restarting OSDs....")
+        # They are still look to be up because of setting nodown
+        for osd in manager.get_osd_status()['up']:
+            manager.revive_osd(osd)
+        # Wait for health?
+        time.sleep(5)
+        # Let scrub after test runs verify consistency of all copies
+        log.info("Verify replicated import data")
+        objects = range(1, NUM_OBJECTS + 1)
+        for i in objects:
+            NAME = REP_NAME + "{num}".format(num=i)
+            TESTNAME = os.path.join(DATADIR, "gettest")
+            REFNAME = os.path.join(DATADIR, NAME)
+
+            proc = rados(ctx, cli_remote,
+                         ['-p', REP_POOL, 'get', NAME, TESTNAME], wait=False)
+
+            ret = proc.wait()
+            if ret != 0:
+                log.error("After import, rados get failed with {ret}".
+                          format(ret=proc.exitstatus))
+                ERRORS += 1
+                continue
+
+            cmd = "diff -q {gettest} {ref}".format(gettest=TESTNAME,
+                                                   ref=REFNAME)
+            proc = cli_remote.run(args=cmd, check_status=False)
+            proc.wait()
+            if proc.exitstatus != 0:
+                log.error("Data comparison failed for {obj}".format(obj=NAME))
+                ERRORS += 1
+
+    return ERRORS
diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py
new file mode 100644
index 000000000..0de395c06
--- /dev/null
+++ b/qa/tasks/ceph_test_case.py
@@ -0,0 +1,215 @@
+from typing import Optional, TYPE_CHECKING
+import unittest
+import time
+import logging
+
+from teuthology.orchestra.run import CommandFailedError
+
+if TYPE_CHECKING:
+    from tasks.mgr.mgr_test_case import MgrCluster
+
+log = logging.getLogger(__name__)
+
+class TestTimeoutError(RuntimeError):
+    pass
+
+class CephTestCase(unittest.TestCase):
+    """
+    For test tasks that want to define a structured set of
+    tests implemented in python.  Subclass this with appropriate
+    helpers for the subsystem you're testing.
+    """
+
+    # Environment references
+    mounts = None
+    fs = None
+    recovery_fs = None
+    backup_fs = None
+    ceph_cluster = None
+    mds_cluster = None
+    mgr_cluster: Optional['MgrCluster'] = None
+    ctx = None
+
+    mon_manager = None
+
+    # Declarative test requirements: subclasses should override these to indicate
+    # their special needs.  If not met, tests will be skipped.
+    REQUIRE_MEMSTORE = False
+
+    def setUp(self):
+        self._mon_configs_set = set()
+
+        self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+            "Starting test {0}".format(self.id()))
+
+        if self.REQUIRE_MEMSTORE:
+            objectstore = self.ceph_cluster.get_config("osd_objectstore", "osd")
+            if objectstore != "memstore":
+                # You certainly *could* run this on a real OSD, but you don't want to sit
+                # here for hours waiting for the test to fill up a 1TB drive!
+                raise self.skipTest("Require `memstore` OSD backend (test " \
+                        "would take too long on full sized OSDs")
+
+    def tearDown(self):
+        self.config_clear()
+
+        self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+            "Ended test {0}".format(self.id()))
+
+    def config_clear(self):
+        for section, key in self._mon_configs_set:
+            self.config_rm(section, key)
+        self._mon_configs_set.clear()
+
+    def _fix_key(self, key):
+        return str(key).replace(' ', '_')
+
+    def config_get(self, section, key):
+       key = self._fix_key(key)
+       return self.ceph_cluster.mon_manager.raw_cluster_cmd("config", "get", section, key).strip()
+
+    def config_show(self, entity, key):
+       key = self._fix_key(key)
+       return self.ceph_cluster.mon_manager.raw_cluster_cmd("config", "show", entity, key).strip()
+
+    def config_minimal(self):
+       return self.ceph_cluster.mon_manager.raw_cluster_cmd("config", "generate-minimal-conf").strip()
+
+    def config_rm(self, section, key):
+       key = self._fix_key(key)
+       self.ceph_cluster.mon_manager.raw_cluster_cmd("config", "rm", section, key)
+       # simplification: skip removing from _mon_configs_set;
+       # let tearDown clear everything again
+
+    def config_set(self, section, key, value):
+       key = self._fix_key(key)
+       self._mon_configs_set.add((section, key))
+       self.ceph_cluster.mon_manager.raw_cluster_cmd("config", "set", section, key, str(value))
+
+    def cluster_cmd(self, command: str):
+        assert self.ceph_cluster is not None
+        return self.ceph_cluster.mon_manager.raw_cluster_cmd(*(command.split(" ")))
+
+
+    def assert_cluster_log(self, expected_pattern, invert_match=False,
+                           timeout=10, watch_channel=None):
+        """
+        Context manager.  Assert that during execution, or up to 5 seconds later,
+        the Ceph cluster log emits a message matching the expected pattern.
+
+        :param expected_pattern: A string that you expect to see in the log output
+        :type expected_pattern: str
+        :param watch_channel: Specifies the channel to be watched. This can be
+                              'cluster', 'audit', ...
+        :type watch_channel: str
+        """
+
+        ceph_manager = self.ceph_cluster.mon_manager
+
+        class ContextManager(object):
+            def match(self):
+                found = expected_pattern in self.watcher_process.stdout.getvalue()
+                if invert_match:
+                    return not found
+
+                return found
+
+            def __enter__(self):
+                self.watcher_process = ceph_manager.run_ceph_w(watch_channel)
+
+            def __exit__(self, exc_type, exc_val, exc_tb):
+                if not self.watcher_process.finished:
+                    # Check if we got an early match, wait a bit if we didn't
+                    if self.match():
+                        return
+                    else:
+                        log.debug("No log hits yet, waiting...")
+                        # Default monc tick interval is 10s, so wait that long and
+                        # then some grace
+                        time.sleep(5 + timeout)
+
+                self.watcher_process.stdin.close()
+                try:
+                    self.watcher_process.wait()
+                except CommandFailedError:
+                    pass
+
+                if not self.match():
+                    log.error("Log output: \n{0}\n".format(self.watcher_process.stdout.getvalue()))
+                    raise AssertionError("Expected log message not found: '{0}'".format(expected_pattern))
+
+        return ContextManager()
+
+    def wait_for_health(self, pattern, timeout):
+        """
+        Wait until 'ceph health' contains messages matching the pattern
+        """
+        def seen_health_warning():
+            health = self.ceph_cluster.mon_manager.get_mon_health()
+            codes = [s for s in health['checks']]
+            summary_strings = [s[1]['summary']['message'] for s in health['checks'].items()]
+            if len(summary_strings) == 0:
+                log.debug("Not expected number of summary strings ({0})".format(summary_strings))
+                return False
+            else:
+                for ss in summary_strings:
+                    if pattern in ss:
+                         return True
+                if pattern in codes:
+                    return True
+
+            log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
+            return False
+
+        self.wait_until_true(seen_health_warning, timeout)
+
+    def wait_for_health_clear(self, timeout):
+        """
+        Wait until `ceph health` returns no messages
+        """
+        def is_clear():
+            health = self.ceph_cluster.mon_manager.get_mon_health()
+            return len(health['checks']) == 0
+
+        self.wait_until_true(is_clear, timeout)
+
+    def wait_until_equal(self, get_fn, expect_val, timeout, reject_fn=None, period=5):
+        elapsed = 0
+        while True:
+            val = get_fn()
+            if val == expect_val:
+                return
+            elif reject_fn and reject_fn(val):
+                raise RuntimeError("wait_until_equal: forbidden value {0} seen".format(val))
+            else:
+                if elapsed >= timeout:
+                    raise TestTimeoutError("Timed out after {0} seconds waiting for {1} (currently {2})".format(
+                        elapsed, expect_val, val
+                    ))
+                else:
+                    log.debug("wait_until_equal: {0} != {1}, waiting (timeout={2})...".format(val, expect_val, timeout))
+                time.sleep(period)
+                elapsed += period
+
+        log.debug("wait_until_equal: success")
+
+    @classmethod
+    def wait_until_true(cls, condition, timeout, check_fn=None, period=5):
+        elapsed = 0
+        retry_count = 0
+        while True:
+            if condition():
+                log.debug("wait_until_true: success in {0}s and {1} retries".format(elapsed, retry_count))
+                return
+            else:
+                if elapsed >= timeout:
+                    if check_fn and check_fn() and retry_count < 5:
+                        elapsed = 0
+                        retry_count += 1
+                        log.debug("wait_until_true: making progress, waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
+                    else:
+                        raise TestTimeoutError("Timed out after {0}s and {1} retries".format(elapsed, retry_count))
+                else:
+                    log.debug("wait_until_true: waiting (timeout={0} retry_count={1})...".format(timeout, retry_count))
+                time.sleep(period)
+                elapsed += period
diff --git a/qa/tasks/cephadm.conf b/qa/tasks/cephadm.conf
new file mode 100644
index 000000000..bd1ab821e
--- /dev/null
+++ b/qa/tasks/cephadm.conf
@@ -0,0 +1,89 @@
+[global]
+# make logging friendly to teuthology
+log_to_file = true
+log_to_stderr = false
+mon cluster log file level = debug
+
+mon clock drift allowed = 1.000
+
+# replicate across OSDs, not hosts
+osd crush chooseleaf type = 0
+#osd pool default size = 2
+osd pool default erasure code profile = "plugin=jerasure technique=reed_sol_van k=2 m=1 ruleset-failure-domain=osd crush-failure-domain=osd"
+
+# enable some debugging
+auth debug = true
+ms die on old message = true
+ms die on bug = true
+debug asserts on shutdown = true
+
+# adjust warnings
+mon max pg per osd = 10000        # >= luminous
+mon pg warn max object skew = 0
+mon osd allow primary affinity = true
+mon osd allow pg remap = true
+mon warn on legacy crush tunables = false
+mon warn on crush straw calc version zero = false
+mon warn on no sortbitwise = false
+mon warn on osd down out interval zero = false
+mon warn on too few osds = false
+mon_warn_on_pool_pg_num_not_power_of_two = false
+
+# disable pg_autoscaler by default for new pools
+osd_pool_default_pg_autoscale_mode = off
+
+# tests delete pools
+mon allow pool delete = true
+
+[osd]
+osd scrub load threshold = 5.0
+osd scrub max interval = 600
+
+osd recover clone overlap = true
+osd recovery max chunk = 1048576
+
+osd deep scrub update digest min age = 30
+
+osd map max advance = 10
+
+osd memory target autotune = true
+
+# debugging
+osd debug shutdown = true
+osd debug op order = true
+osd debug verify stray on activate = true
+osd debug pg log writeout = true
+osd debug verify cached snaps = true
+osd debug verify missing on start = true
+osd debug misdirected ops = true
+osd op queue = debug_random
+osd op queue cut off = debug_random
+osd shutdown pgref assert = true
+bdev debug aio = true
+osd sloppy crc = true
+
+[mgr]
+mon reweight min pgs per osd = 4
+mon reweight min bytes per osd = 10
+mgr/telemetry/nag = false
+
+[mon]
+mon data avail warn = 5
+mon mgr mkfs grace = 240
+mon reweight min pgs per osd = 4
+mon osd reporter subtree level = osd
+mon osd prime pg temp = true
+mon reweight min bytes per osd = 10
+
+# rotate auth tickets quickly to exercise renewal paths
+auth mon ticket ttl = 660      # 11m
+auth service ticket ttl = 240  # 4m
+
+# don't complain about global id reclaim
+mon_warn_on_insecure_global_id_reclaim = false
+mon_warn_on_insecure_global_id_reclaim_allowed = false
+
+[client.rgw]
+rgw cache enabled = true
+rgw enable ops log = true
+rgw enable usage log = true
diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py
new file mode 100644
index 000000000..8bd7f220c
--- /dev/null
+++ b/qa/tasks/cephadm.py
@@ -0,0 +1,1503 @@
+"""
+Ceph cluster task, deployed via cephadm orchestrator
+"""
+import argparse
+import configobj
+import contextlib
+import logging
+import os
+import json
+import re
+import uuid
+import yaml
+
+from io import BytesIO, StringIO
+from tarfile import ReadError
+from tasks.ceph_manager import CephManager
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.orchestra.daemon import DaemonGroup
+from teuthology.config import config as teuth_config
+
+# these items we use from ceph.py should probably eventually move elsewhere
+from tasks.ceph import get_mons, healthy
+from tasks.vip import subst_vip
+
+CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
+
+log = logging.getLogger(__name__)
+
+
+def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
+    teuthology.get_testdir(ctx)
+    return remote.run(
+        args=[
+            'sudo',
+            ctx.cephadm,
+            '--image', ctx.ceph[cluster_name].image,
+            'shell',
+            '-c', '/etc/ceph/{}.conf'.format(cluster_name),
+            '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+            '--fsid', ctx.ceph[cluster_name].fsid,
+            ] + extra_cephadm_args + [
+            '--',
+            ] + args,
+        **kwargs
+    )
+
+
+def build_initial_config(ctx, config):
+    cluster_name = config['cluster']
+
+    path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
+    conf = configobj.ConfigObj(path, file_error=True)
+
+    conf.setdefault('global', {})
+    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
+
+    # overrides
+    for section, keys in config.get('conf',{}).items():
+        for key, value in keys.items():
+            log.info(" override: [%s] %s = %s" % (section, key, value))
+            if section not in conf:
+                conf[section] = {}
+            conf[section][key] = value
+
+    return conf
+
+
+def update_archive_setting(ctx, key, value):
+    """
+    Add logs directory to job's info log file
+    """
+    if ctx.archive is None:
+        return
+    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
+        info_yaml = yaml.safe_load(info_file)
+        info_file.seek(0)
+        if 'archive' in info_yaml:
+            info_yaml['archive'][key] = value
+        else:
+            info_yaml['archive'] = {key: value}
+        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
+
+
+@contextlib.contextmanager
+def normalize_hostnames(ctx):
+    """
+    Ensure we have short hostnames throughout, for consistency between
+    remote.shortname and socket.gethostname() in cephadm.
+    """
+    log.info('Normalizing hostnames...')
+    ctx.cluster.run(args=[
+        'sudo',
+        'hostname',
+        run.Raw('$(hostname -s)'),
+    ])
+
+    try:
+        yield
+    finally:
+        pass
+
+
+@contextlib.contextmanager
+def download_cephadm(ctx, config, ref):
+    cluster_name = config['cluster']
+
+    if config.get('cephadm_mode') != 'cephadm-package':
+        ref = config.get('cephadm_branch', ref)
+        git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
+        log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
+        if ctx.config.get('redhat'):
+            log.info("Install cephadm using RPM")
+            # cephadm already installed from redhat.install task
+            ctx.cluster.run(
+                args=[
+                    'cp',
+                    run.Raw('$(which cephadm)'),
+                    ctx.cephadm,
+                    run.Raw('&&'),
+                    'ls', '-l',
+                    ctx.cephadm,
+                ]
+            )
+        elif git_url.startswith('https://github.com/'):
+            # git archive doesn't like https:// URLs, which we use with github.
+            rest = git_url.split('https://github.com/', 1)[1]
+            rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
+            ctx.cluster.run(
+                args=[
+                    'curl', '--silent',
+                    'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
+                    run.Raw('>'),
+                    ctx.cephadm,
+                    run.Raw('&&'),
+                    'ls', '-l',
+                    ctx.cephadm,
+                ],
+            )
+        else:
+            ctx.cluster.run(
+                args=[
+                    'git', 'archive',
+                    '--remote=' + git_url,
+                    ref,
+                    'src/cephadm/cephadm',
+                    run.Raw('|'),
+                    'tar', '-xO', 'src/cephadm/cephadm',
+                    run.Raw('>'),
+                    ctx.cephadm,
+                ],
+            )
+        # sanity-check the resulting file and set executable bit
+        cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
+        ctx.cluster.run(
+            args=[
+                'test', '-s', ctx.cephadm,
+                run.Raw('&&'),
+                'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
+                run.Raw('&&'),
+                'chmod', '+x', ctx.cephadm,
+            ],
+        )
+
+    try:
+        yield
+    finally:
+        log.info('Removing cluster...')
+        ctx.cluster.run(args=[
+            'sudo',
+            ctx.cephadm,
+            'rm-cluster',
+            '--fsid', ctx.ceph[cluster_name].fsid,
+            '--force',
+        ])
+
+        if config.get('cephadm_mode') == 'root':
+            log.info('Removing cephadm ...')
+            ctx.cluster.run(
+                args=[
+                    'rm',
+                    '-rf',
+                    ctx.cephadm,
+                ],
+            )
+
+
+@contextlib.contextmanager
+def ceph_log(ctx, config):
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    update_archive_setting(ctx, 'log', '/var/log/ceph')
+
+
+    try:
+        yield
+
+    except Exception:
+        # we need to know this below
+        ctx.summary['success'] = False
+        raise
+
+    finally:
+        log.info('Checking cluster log for badness...')
+        def first_in_ceph_log(pattern, excludes):
+            """
+            Find the first occurrence of the pattern specified in the Ceph log,
+            Returns None if none found.
+
+            :param pattern: Pattern scanned for.
+            :param excludes: Patterns to ignore.
+            :return: First line of text (or None if not found)
+            """
+            args = [
+                'sudo',
+                'egrep', pattern,
+                '/var/log/ceph/{fsid}/ceph.log'.format(
+                    fsid=fsid),
+            ]
+            if excludes:
+                for exclude in excludes:
+                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
+            args.extend([
+                run.Raw('|'), 'head', '-n', '1',
+            ])
+            r = ctx.ceph[cluster_name].bootstrap_remote.run(
+                stdout=StringIO(),
+                args=args,
+            )
+            stdout = r.stdout.getvalue()
+            if stdout != '':
+                return stdout
+            return None
+
+        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
+                             config.get('log-ignorelist')) is not None:
+            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
+            ctx.summary['success'] = False
+            # use the most severe problem as the failure reason
+            if 'failure_reason' not in ctx.summary:
+                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
+                    match = first_in_ceph_log(pattern, config['log-ignorelist'])
+                    if match is not None:
+                        ctx.summary['failure_reason'] = \
+                            '"{match}" in cluster log'.format(
+                                match=match.rstrip('\n'),
+                            )
+                        break
+
+        if ctx.archive is not None and \
+                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+            # and logs
+            log.info('Compressing logs...')
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo',
+                        'find',
+                        '/var/log/ceph',   # all logs, not just for the cluster
+                        '/var/log/rbd-target-api', # ceph-iscsi
+                        '-name',
+                        '*.log',
+                        '-print0',
+                        run.Raw('|'),
+                        'sudo',
+                        'xargs',
+                        '-0',
+                        '--no-run-if-empty',
+                        '--',
+                        'gzip',
+                        '--',
+                    ],
+                    wait=False,
+                ),
+            )
+
+            log.info('Archiving logs...')
+            path = os.path.join(ctx.archive, 'remote')
+            try:
+                os.makedirs(path)
+            except OSError:
+                pass
+            for remote in ctx.cluster.remotes.keys():
+                sub = os.path.join(path, remote.name)
+                try:
+                    os.makedirs(sub)
+                except OSError:
+                    pass
+                try:
+                    teuthology.pull_directory(remote, '/var/log/ceph',  # everything
+                                              os.path.join(sub, 'log'))
+                except ReadError:
+                    pass
+
+
+@contextlib.contextmanager
+def ceph_crash(ctx, config):
+    """
+    Gather crash dumps from /var/lib/ceph/$fsid/crash
+    """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')
+
+    try:
+        yield
+
+    finally:
+        if ctx.archive is not None:
+            log.info('Archiving crash dumps...')
+            path = os.path.join(ctx.archive, 'remote')
+            try:
+                os.makedirs(path)
+            except OSError:
+                pass
+            for remote in ctx.cluster.remotes.keys():
+                sub = os.path.join(path, remote.name)
+                try:
+                    os.makedirs(sub)
+                except OSError:
+                    pass
+                try:
+                    teuthology.pull_directory(remote,
+                                              '/var/lib/ceph/%s/crash' % fsid,
+                                              os.path.join(sub, 'crash'))
+                except ReadError:
+                    pass
+
+
+@contextlib.contextmanager
+def ceph_bootstrap(ctx, config):
+    """
+    Bootstrap ceph cluster.
+
+    :param ctx: the argparse.Namespace object
+    :param config: the config dict
+    """
+    cluster_name = config['cluster']
+    testdir = teuthology.get_testdir(ctx)
+    fsid = ctx.ceph[cluster_name].fsid
+
+    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
+    first_mon = ctx.ceph[cluster_name].first_mon
+    first_mon_role = ctx.ceph[cluster_name].first_mon_role
+    mons = ctx.ceph[cluster_name].mons
+
+    ctx.cluster.run(args=[
+        'sudo', 'mkdir', '-p', '/etc/ceph',
+        ]);
+    ctx.cluster.run(args=[
+        'sudo', 'chmod', '777', '/etc/ceph',
+        ]);
+    try:
+        # write seed config
+        log.info('Writing seed config...')
+        conf_fp = BytesIO()
+        seed_config = build_initial_config(ctx, config)
+        seed_config.write(conf_fp)
+        bootstrap_remote.write_file(
+            path='{}/seed.{}.conf'.format(testdir, cluster_name),
+            data=conf_fp.getvalue())
+        log.debug('Final config:\n' + conf_fp.getvalue().decode())
+        ctx.ceph[cluster_name].conf = seed_config
+
+        # register initial daemons
+        ctx.daemons.register_daemon(
+            bootstrap_remote, 'mon', first_mon,
+            cluster=cluster_name,
+            fsid=fsid,
+            logger=log.getChild('mon.' + first_mon),
+            wait=False,
+            started=True,
+        )
+        if not ctx.ceph[cluster_name].roleless:
+            first_mgr = ctx.ceph[cluster_name].first_mgr
+            ctx.daemons.register_daemon(
+                bootstrap_remote, 'mgr', first_mgr,
+                cluster=cluster_name,
+                fsid=fsid,
+                logger=log.getChild('mgr.' + first_mgr),
+                wait=False,
+                started=True,
+            )
+
+        # bootstrap
+        log.info('Bootstrapping...')
+        cmd = [
+            'sudo',
+            ctx.cephadm,
+            '--image', ctx.ceph[cluster_name].image,
+            '-v',
+            'bootstrap',
+            '--fsid', fsid,
+            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
+            '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
+            '--output-keyring',
+            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
+        ]
+
+        if config.get('registry-login'):
+            registry = config['registry-login']
+            cmd += [
+                "--registry-url", registry['url'],
+                "--registry-username", registry['username'],
+                "--registry-password", registry['password'],
+            ]
+
+        if not ctx.ceph[cluster_name].roleless:
+            cmd += [
+                '--mon-id', first_mon,
+                '--mgr-id', first_mgr,
+                '--orphan-initial-daemons',   # we will do it explicitly!
+                '--skip-monitoring-stack',    # we'll provision these explicitly
+            ]
+
+        if mons[first_mon_role].startswith('['):
+            cmd += ['--mon-addrv', mons[first_mon_role]]
+        else:
+            cmd += ['--mon-ip', mons[first_mon_role]]
+        if config.get('skip_dashboard'):
+            cmd += ['--skip-dashboard']
+        if config.get('skip_monitoring_stack'):
+            cmd += ['--skip-monitoring-stack']
+        if config.get('single_host_defaults'):
+            cmd += ['--single-host-defaults']
+        if not config.get('avoid_pacific_features', False):
+            cmd += ['--skip-admin-label']
+        # bootstrap makes the keyring root 0600, so +r it for our purposes
+        cmd += [
+            run.Raw('&&'),
+            'sudo', 'chmod', '+r',
+            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+        ]
+        bootstrap_remote.run(args=cmd)
+
+        # fetch keys and configs
+        log.info('Fetching config...')
+        ctx.ceph[cluster_name].config_file = \
+            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
+        log.info('Fetching client.admin keyring...')
+        ctx.ceph[cluster_name].admin_keyring = \
+            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
+        log.info('Fetching mon keyring...')
+        ctx.ceph[cluster_name].mon_keyring = \
+            bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)
+
+        # fetch ssh key, distribute to additional nodes
+        log.info('Fetching pub ssh key...')
+        ssh_pub_key = bootstrap_remote.read_file(
+            f'{testdir}/{cluster_name}.pub').decode('ascii').strip()
+
+        log.info('Installing pub ssh key for root users...')
+        ctx.cluster.run(args=[
+            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
+            run.Raw('&&'),
+            'echo', ssh_pub_key,
+            run.Raw('|'),
+            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
+            run.Raw('&&'),
+            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
+        ])
+
+        # set options
+        if config.get('allow_ptrace', True):
+            _shell(ctx, cluster_name, bootstrap_remote,
+                   ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
+
+        if not config.get('avoid_pacific_features', False):
+            log.info('Distributing conf and client.admin keyring to all hosts + 0755')
+            _shell(ctx, cluster_name, bootstrap_remote,
+                   ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
+                    '*', '--mode', '0755'],
+                   check_status=False)
+
+        # add other hosts
+        for remote in ctx.cluster.remotes.keys():
+            if remote == bootstrap_remote:
+                continue
+
+            # note: this may be redundant (see above), but it avoids
+            # us having to wait for cephadm to do it.
+            log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
+            remote.write_file(
+                path='/etc/ceph/{}.conf'.format(cluster_name),
+                data=ctx.ceph[cluster_name].config_file)
+            remote.write_file(
+                path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+                data=ctx.ceph[cluster_name].admin_keyring)
+
+            log.info('Adding host %s to orchestrator...' % remote.shortname)
+            _shell(ctx, cluster_name, remote, [
+                'ceph', 'orch', 'host', 'add',
+                remote.shortname
+            ])
+            r = _shell(ctx, cluster_name, remote,
+                       ['ceph', 'orch', 'host', 'ls', '--format=json'],
+                       stdout=StringIO())
+            hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
+            assert remote.shortname in hosts
+
+        yield
+
+    finally:
+        log.info('Cleaning up testdir ceph.* files...')
+        ctx.cluster.run(args=[
+            'rm', '-f',
+            '{}/seed.{}.conf'.format(testdir, cluster_name),
+            '{}/{}.pub'.format(testdir, cluster_name),
+        ])
+
+        log.info('Stopping all daemons...')
+
+        # this doesn't block until they are all stopped...
+        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+
+        # stop the daemons we know
+        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
+            cluster, type_, id_ = teuthology.split_role(role)
+            try:
+                ctx.daemons.get_daemon(type_, id_, cluster).stop()
+            except Exception:
+                log.exception(f'Failed to stop "{role}"')
+                raise
+
+        # tear down anything left (but leave the logs behind)
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                ctx.cephadm,
+                'rm-cluster',
+                '--fsid', fsid,
+                '--force',
+                '--keep-logs',
+            ],
+            check_status=False,  # may fail if upgrading from old cephadm
+        )
+
+        # clean up /etc/ceph
+        ctx.cluster.run(args=[
+            'sudo', 'rm', '-f',
+            '/etc/ceph/{}.conf'.format(cluster_name),
+            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+        ])
+
+
+@contextlib.contextmanager
+def ceph_mons(ctx, config):
+    """
+    Deploy any additional mons
+    """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    try:
+        daemons = {}
+        if config.get('add_mons_via_daemon_add'):
+            # This is the old way of adding mons that works with the (early) octopus
+            # cephadm scheduler.
+            num_mons = 1
+            for remote, roles in ctx.cluster.remotes.items():
+                for mon in [r for r in roles
+                            if teuthology.is_type('mon', cluster_name)(r)]:
+                    c_, _, id_ = teuthology.split_role(mon)
+                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
+                        continue
+                    log.info('Adding %s on %s' % (mon, remote.shortname))
+                    num_mons += 1
+                    _shell(ctx, cluster_name, remote, [
+                        'ceph', 'orch', 'daemon', 'add', 'mon',
+                        remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
+                    ])
+                    ctx.daemons.register_daemon(
+                        remote, 'mon', id_,
+                        cluster=cluster_name,
+                        fsid=fsid,
+                        logger=log.getChild(mon),
+                        wait=False,
+                        started=True,
+                    )
+                    daemons[mon] = (remote, id_)
+
+                    with contextutil.safe_while(sleep=1, tries=180) as proceed:
+                        while proceed():
+                            log.info('Waiting for %d mons in monmap...' % (num_mons))
+                            r = _shell(
+                                ctx=ctx,
+                                cluster_name=cluster_name,
+                                remote=remote,
+                                args=[
+                                    'ceph', 'mon', 'dump', '-f', 'json',
+                                ],
+                                stdout=StringIO(),
+                            )
+                            j = json.loads(r.stdout.getvalue())
+                            if len(j['mons']) == num_mons:
+                                break
+        else:
+            nodes = []
+            for remote, roles in ctx.cluster.remotes.items():
+                for mon in [r for r in roles
+                            if teuthology.is_type('mon', cluster_name)(r)]:
+                    c_, _, id_ = teuthology.split_role(mon)
+                    log.info('Adding %s on %s' % (mon, remote.shortname))
+                    nodes.append(remote.shortname
+                                 + ':' + ctx.ceph[cluster_name].mons[mon]
+                                 + '=' + id_)
+                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
+                        continue
+                    daemons[mon] = (remote, id_)
+
+            _shell(ctx, cluster_name, remote, [
+                'ceph', 'orch', 'apply', 'mon',
+                str(len(nodes)) + ';' + ';'.join(nodes)]
+                   )
+            for mgr, i in daemons.items():
+                remote, id_ = i
+                ctx.daemons.register_daemon(
+                    remote, 'mon', id_,
+                    cluster=cluster_name,
+                    fsid=fsid,
+                    logger=log.getChild(mon),
+                    wait=False,
+                    started=True,
+                )
+
+            with contextutil.safe_while(sleep=1, tries=180) as proceed:
+                while proceed():
+                    log.info('Waiting for %d mons in monmap...' % (len(nodes)))
+                    r = _shell(
+                        ctx=ctx,
+                        cluster_name=cluster_name,
+                        remote=remote,
+                        args=[
+                            'ceph', 'mon', 'dump', '-f', 'json',
+                        ],
+                        stdout=StringIO(),
+                    )
+                    j = json.loads(r.stdout.getvalue())
+                    if len(j['mons']) == len(nodes):
+                        break
+
+        # refresh our (final) ceph.conf file
+        bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
+        log.info('Generating final ceph.conf file...')
+        r = _shell(
+            ctx=ctx,
+            cluster_name=cluster_name,
+            remote=bootstrap_remote,
+            args=[
+                'ceph', 'config', 'generate-minimal-conf',
+            ],
+            stdout=StringIO(),
+        )
+        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
+
+        yield
+
+    finally:
+        pass
+
+
+@contextlib.contextmanager
+def ceph_mgrs(ctx, config):
+    """
+    Deploy any additional mgrs
+    """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    try:
+        nodes = []
+        daemons = {}
+        for remote, roles in ctx.cluster.remotes.items():
+            for mgr in [r for r in roles
+                        if teuthology.is_type('mgr', cluster_name)(r)]:
+                c_, _, id_ = teuthology.split_role(mgr)
+                log.info('Adding %s on %s' % (mgr, remote.shortname))
+                nodes.append(remote.shortname + '=' + id_)
+                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
+                    continue
+                daemons[mgr] = (remote, id_)
+        if nodes:
+            _shell(ctx, cluster_name, remote, [
+                'ceph', 'orch', 'apply', 'mgr',
+                str(len(nodes)) + ';' + ';'.join(nodes)]
+            )
+        for mgr, i in daemons.items():
+            remote, id_ = i
+            ctx.daemons.register_daemon(
+                remote, 'mgr', id_,
+                cluster=cluster_name,
+                fsid=fsid,
+                logger=log.getChild(mgr),
+                wait=False,
+                started=True,
+            )
+
+        yield
+
+    finally:
+        pass
+
+
+@contextlib.contextmanager
+def ceph_osds(ctx, config):
+    """
+    Deploy OSDs
+    """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    try:
+        log.info('Deploying OSDs...')
+
+        # provision OSDs in numeric order
+        id_to_remote = {}
+        devs_by_remote = {}
+        for remote, roles in ctx.cluster.remotes.items():
+            devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
+            for osd in [r for r in roles
+                        if teuthology.is_type('osd', cluster_name)(r)]:
+                _, _, id_ = teuthology.split_role(osd)
+                id_to_remote[int(id_)] = (osd, remote)
+
+        cur = 0
+        for osd_id in sorted(id_to_remote.keys()):
+            osd, remote = id_to_remote[osd_id]
+            _, _, id_ = teuthology.split_role(osd)
+            assert int(id_) == cur
+            devs = devs_by_remote[remote]
+            assert devs   ## FIXME ##
+            dev = devs.pop()
+            if all(_ in dev for _ in ('lv', 'vg')):
+                short_dev = dev.replace('/dev/', '')
+            else:
+                short_dev = dev
+            log.info('Deploying %s on %s with %s...' % (
+                osd, remote.shortname, dev))
+            _shell(ctx, cluster_name, remote, [
+                'ceph-volume', 'lvm', 'zap', dev])
+            _shell(ctx, cluster_name, remote, [
+                'ceph', 'orch', 'daemon', 'add', 'osd',
+                remote.shortname + ':' + short_dev
+            ])
+            ctx.daemons.register_daemon(
+                remote, 'osd', id_,
+                cluster=cluster_name,
+                fsid=fsid,
+                logger=log.getChild(osd),
+                wait=False,
+                started=True,
+            )
+            cur += 1
+
+        if cur == 0:
+            _shell(ctx, cluster_name, remote, [
+                'ceph', 'orch', 'apply', 'osd', '--all-available-devices',
+            ])
+            # expect the number of scratch devs
+            num_osds = sum(map(len, devs_by_remote.values()))
+            assert num_osds
+        else:
+            # expect the number of OSDs we created
+            num_osds = cur
+
+        log.info(f'Waiting for {num_osds} OSDs to come up...')
+        with contextutil.safe_while(sleep=1, tries=120) as proceed:
+            while proceed():
+                p = _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
+                           ['ceph', 'osd', 'stat', '-f', 'json'], stdout=StringIO())
+                j = json.loads(p.stdout.getvalue())
+                if int(j.get('num_up_osds', 0)) == num_osds:
+                    break;
+
+        yield
+    finally:
+        pass
+
+
+@contextlib.contextmanager
+def ceph_mdss(ctx, config):
+    """
+    Deploy MDSss
+    """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    nodes = []
+    daemons = {}
+    for remote, roles in ctx.cluster.remotes.items():
+        for role in [r for r in roles
+                    if teuthology.is_type('mds', cluster_name)(r)]:
+            c_, _, id_ = teuthology.split_role(role)
+            log.info('Adding %s on %s' % (role, remote.shortname))
+            nodes.append(remote.shortname + '=' + id_)
+            daemons[role] = (remote, id_)
+    if nodes:
+        _shell(ctx, cluster_name, remote, [
+            'ceph', 'orch', 'apply', 'mds',
+            'all',
+            str(len(nodes)) + ';' + ';'.join(nodes)]
+        )
+    for role, i in daemons.items():
+        remote, id_ = i
+        ctx.daemons.register_daemon(
+            remote, 'mds', id_,
+            cluster=cluster_name,
+            fsid=fsid,
+            logger=log.getChild(role),
+            wait=False,
+            started=True,
+        )
+
+    yield
+
+
+@contextlib.contextmanager
+def ceph_monitoring(daemon_type, ctx, config):
+    """
+    Deploy prometheus, node-exporter, etc.
+    """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    nodes = []
+    daemons = {}
+    for remote, roles in ctx.cluster.remotes.items():
+        for role in [r for r in roles
+                    if teuthology.is_type(daemon_type, cluster_name)(r)]:
+            c_, _, id_ = teuthology.split_role(role)
+            log.info('Adding %s on %s' % (role, remote.shortname))
+            nodes.append(remote.shortname + '=' + id_)
+            daemons[role] = (remote, id_)
+    if nodes:
+        _shell(ctx, cluster_name, remote, [
+            'ceph', 'orch', 'apply', daemon_type,
+            str(len(nodes)) + ';' + ';'.join(nodes)]
+        )
+    for role, i in daemons.items():
+        remote, id_ = i
+        ctx.daemons.register_daemon(
+            remote, daemon_type, id_,
+            cluster=cluster_name,
+            fsid=fsid,
+            logger=log.getChild(role),
+            wait=False,
+            started=True,
+        )
+
+    yield
+
+
+@contextlib.contextmanager
+def ceph_rgw(ctx, config):
+    """
+    Deploy rgw
+    """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    nodes = {}
+    daemons = {}
+    for remote, roles in ctx.cluster.remotes.items():
+        for role in [r for r in roles
+                    if teuthology.is_type('rgw', cluster_name)(r)]:
+            c_, _, id_ = teuthology.split_role(role)
+            log.info('Adding %s on %s' % (role, remote.shortname))
+            svc = '.'.join(id_.split('.')[0:2])
+            if svc not in nodes:
+                nodes[svc] = []
+            nodes[svc].append(remote.shortname + '=' + id_)
+            daemons[role] = (remote, id_)
+
+    for svc, nodes in nodes.items():
+        _shell(ctx, cluster_name, remote, [
+            'ceph', 'orch', 'apply', 'rgw', svc,
+             '--placement',
+             str(len(nodes)) + ';' + ';'.join(nodes)]
+        )
+    for role, i in daemons.items():
+        remote, id_ = i
+        ctx.daemons.register_daemon(
+            remote, 'rgw', id_,
+            cluster=cluster_name,
+            fsid=fsid,
+            logger=log.getChild(role),
+            wait=False,
+            started=True,
+        )
+
+    yield
+
+
+@contextlib.contextmanager
+def ceph_iscsi(ctx, config):
+    """
+    Deploy iSCSIs
+    """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
+    nodes = []
+    daemons = {}
+    for remote, roles in ctx.cluster.remotes.items():
+        for role in [r for r in roles
+                    if teuthology.is_type('iscsi', cluster_name)(r)]:
+            c_, _, id_ = teuthology.split_role(role)
+            log.info('Adding %s on %s' % (role, remote.shortname))
+            nodes.append(remote.shortname + '=' + id_)
+            daemons[role] = (remote, id_)
+    if nodes:
+        poolname = 'iscsi'
+        # ceph osd pool create iscsi 3 3 replicated
+        _shell(ctx, cluster_name, remote, [
+            'ceph', 'osd', 'pool', 'create',
+            poolname, '3', '3', 'replicated']
+        )
+
+        _shell(ctx, cluster_name, remote, [
+            'ceph', 'osd', 'pool', 'application', 'enable',
+            poolname, 'rbd']
+        )
+
+        # ceph orch apply iscsi iscsi user password
+        _shell(ctx, cluster_name, remote, [
+            'ceph', 'orch', 'apply', 'iscsi',
+            poolname, 'user', 'password',
+            '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
+        )
+    for role, i in daemons.items():
+        remote, id_ = i
+        ctx.daemons.register_daemon(
+            remote, 'iscsi', id_,
+            cluster=cluster_name,
+            fsid=fsid,
+            logger=log.getChild(role),
+            wait=False,
+            started=True,
+        )
+
+    yield
+
+
+@contextlib.contextmanager
+def ceph_clients(ctx, config):
+    cluster_name = config['cluster']
+
+    log.info('Setting up client nodes...')
+    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
+    for remote, roles_for_host in clients.remotes.items():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
+                                                     cluster_name):
+            name = teuthology.ceph_role(role)
+            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
+                                                                name)
+            r = _shell(
+                ctx=ctx,
+                cluster_name=cluster_name,
+                remote=remote,
+                args=[
+                    'ceph', 'auth',
+                    'get-or-create', name,
+                    'mon', 'allow *',
+                    'osd', 'allow *',
+                    'mds', 'allow *',
+                    'mgr', 'allow *',
+                ],
+                stdout=StringIO(),
+            )
+            keyring = r.stdout.getvalue()
+            remote.sudo_write_file(client_keyring, keyring, mode='0644')
+    yield
+
+
+@contextlib.contextmanager
+def ceph_initial():
+    try:
+        yield
+    finally:
+        log.info('Teardown complete')
+
+
+## public methods
+@contextlib.contextmanager
+def stop(ctx, config):
+    """
+    Stop ceph daemons
+
+    For example::
+      tasks:
+      - ceph.stop: [mds.*]
+
+      tasks:
+      - ceph.stop: [osd.0, osd.2]
+
+      tasks:
+      - ceph.stop:
+          daemons: [osd.0, osd.2]
+
+    """
+    if config is None:
+        config = {}
+    elif isinstance(config, list):
+        config = {'daemons': config}
+
+    daemons = ctx.daemons.resolve_role_list(
+        config.get('daemons', None), CEPH_ROLE_TYPES, True)
+    clusters = set()
+
+    for role in daemons:
+        cluster, type_, id_ = teuthology.split_role(role)
+        ctx.daemons.get_daemon(type_, id_, cluster).stop()
+        clusters.add(cluster)
+
+#    for cluster in clusters:
+#        ctx.ceph[cluster].watchdog.stop()
+#        ctx.ceph[cluster].watchdog.join()
+
+    yield
+
+
+def shell(ctx, config):
+    """
+    Execute (shell) commands
+    """
+    cluster_name = config.get('cluster', 'ceph')
+
+    args = []
+    for k in config.pop('env', []):
+        args.extend(['-e', k + '=' + ctx.config.get(k, '')])
+    for k in config.pop('volumes', []):
+        args.extend(['-v', k])
+
+    if 'all-roles' in config and len(config) == 1:
+        a = config['all-roles']
+        roles = teuthology.all_roles(ctx.cluster)
+        config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
+    elif 'all-hosts' in config and len(config) == 1:
+        a = config['all-hosts']
+        roles = teuthology.all_roles(ctx.cluster)
+        config = dict((id_, a) for id_ in roles if id_.startswith('host.'))
+
+    for role, cmd in config.items():
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        log.info('Running commands on role %s host %s', role, remote.name)
+        if isinstance(cmd, list):
+            for c in cmd:
+                _shell(ctx, cluster_name, remote,
+                       ['bash', '-c', subst_vip(ctx, c)],
+                       extra_cephadm_args=args)
+        else:
+            assert isinstance(cmd, str)
+            _shell(ctx, cluster_name, remote,
+                   ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
+                   extra_cephadm_args=args)
+
+
+def apply(ctx, config):
+    """
+    Apply spec
+    
+      tasks:
+        - cephadm.apply:
+            specs:
+            - service_type: rgw
+              service_id: foo
+              spec:
+                rgw_frontend_port: 8000
+            - service_type: rgw
+              service_id: bar
+              spec:
+                rgw_frontend_port: 9000
+                zone: bar
+                realm: asdf
+
+    """
+    cluster_name = config.get('cluster', 'ceph')
+
+    specs = config.get('specs', [])
+    y = subst_vip(ctx, yaml.dump_all(specs))
+
+    log.info(f'Applying spec(s):\n{y}')
+    _shell(
+        ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
+        ['ceph', 'orch', 'apply', '-i', '-'],
+        stdin=y,
+    )
+
+
+def wait_for_service(ctx, config):
+    """
+    Wait for a service to be fully started
+
+      tasks:
+        - cephadm.wait_for_service:
+            service: rgw.foo
+            timeout: 60    # defaults to 300
+
+    """
+    cluster_name = config.get('cluster', 'ceph')
+    timeout = config.get('timeout', 300)
+    service = config.get('service')
+    assert service
+
+    log.info(
+        f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
+    )
+    with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
+        while proceed():
+            r = _shell(
+                ctx=ctx,
+                cluster_name=cluster_name,
+                remote=ctx.ceph[cluster_name].bootstrap_remote,
+                args=[
+                    'ceph', 'orch', 'ls', '-f', 'json',
+                ],
+                stdout=StringIO(),
+            )
+            j = json.loads(r.stdout.getvalue())
+            svc = None
+            for s in j:
+                if s['service_name'] == service:
+                    svc = s
+                    break
+            if svc:
+                log.info(
+                    f"{service} has {s['status']['running']}/{s['status']['size']}"
+                )
+                if s['status']['running'] == s['status']['size']:
+                    break
+
+
+@contextlib.contextmanager
+def tweaked_option(ctx, config):
+    """
+    set an option, and then restore it with its original value
+
+    Note, due to the way how tasks are executed/nested, it's not suggested to
+    use this method as a standalone task. otherwise, it's likely that it will
+    restore the tweaked option at the /end/ of 'tasks' block.
+    """
+    saved_options = {}
+    # we can complicate this when necessary
+    options = ['mon-health-to-clog']
+    type_, id_ = 'mon', '*'
+    cluster = config.get('cluster', 'ceph')
+    manager = ctx.managers[cluster]
+    if id_ == '*':
+        get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
+    else:
+        get_from = id_
+    for option in options:
+        if option not in config:
+            continue
+        value = 'true' if config[option] else 'false'
+        option = option.replace('-', '_')
+        old_value = manager.get_config(type_, get_from, option)
+        if value != old_value:
+            saved_options[option] = old_value
+            manager.inject_args(type_, id_, option, value)
+    yield
+    for option, value in saved_options.items():
+        manager.inject_args(type_, id_, option, value)
+
+
+@contextlib.contextmanager
+def restart(ctx, config):
+    """
+   restart ceph daemons
+
+   For example::
+      tasks:
+      - ceph.restart: [all]
+
+   For example::
+      tasks:
+      - ceph.restart: [osd.0, mon.1, mds.*]
+
+   or::
+
+      tasks:
+      - ceph.restart:
+          daemons: [osd.0, mon.1]
+          wait-for-healthy: false
+          wait-for-osds-up: true
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    if config is None:
+        config = {}
+    elif isinstance(config, list):
+        config = {'daemons': config}
+
+    daemons = ctx.daemons.resolve_role_list(
+        config.get('daemons', None), CEPH_ROLE_TYPES, True)
+    clusters = set()
+
+    log.info('daemons %s' % daemons)
+    with tweaked_option(ctx, config):
+        for role in daemons:
+            cluster, type_, id_ = teuthology.split_role(role)
+            d = ctx.daemons.get_daemon(type_, id_, cluster)
+            assert d, 'daemon %s does not exist' % role
+            d.stop()
+            if type_ == 'osd':
+                ctx.managers[cluster].mark_down_osd(id_)
+            d.restart()
+            clusters.add(cluster)
+
+    if config.get('wait-for-healthy', True):
+        for cluster in clusters:
+            healthy(ctx=ctx, config=dict(cluster=cluster))
+    if config.get('wait-for-osds-up', False):
+        for cluster in clusters:
+            ctx.managers[cluster].wait_for_all_osds_up()
+    yield
+
+
+@contextlib.contextmanager
+def distribute_config_and_admin_keyring(ctx, config):
+    """
+    Distribute a sufficient config and keyring for clients
+    """
+    cluster_name = config['cluster']
+    log.info('Distributing (final) config and client.admin keyring...')
+    for remote, roles in ctx.cluster.remotes.items():
+        remote.write_file(
+            '/etc/ceph/{}.conf'.format(cluster_name),
+            ctx.ceph[cluster_name].config_file,
+            sudo=True)
+        remote.write_file(
+            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+            data=ctx.ceph[cluster_name].admin_keyring,
+            sudo=True)
+    try:
+        yield
+    finally:
+        ctx.cluster.run(args=[
+            'sudo', 'rm', '-f',
+            '/etc/ceph/{}.conf'.format(cluster_name),
+            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+        ])
+
+
+@contextlib.contextmanager
+def crush_setup(ctx, config):
+    cluster_name = config['cluster']
+
+    profile = config.get('crush_tunables', 'default')
+    log.info('Setting crush tunables to %s', profile)
+    _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
+        args=['ceph', 'osd', 'crush', 'tunables', profile])
+    yield
+
+
+@contextlib.contextmanager
+def create_rbd_pool(ctx, config):
+    if config.get('create_rbd_pool', False):
+      cluster_name = config['cluster']
+      log.info('Waiting for OSDs to come up')
+      teuthology.wait_until_osds_up(
+          ctx,
+          cluster=ctx.cluster,
+          remote=ctx.ceph[cluster_name].bootstrap_remote,
+          ceph_cluster=cluster_name,
+      )
+      log.info('Creating RBD pool')
+      _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
+          args=['sudo', 'ceph', '--cluster', cluster_name,
+                'osd', 'pool', 'create', 'rbd', '8'])
+      _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
+          args=['sudo', 'ceph', '--cluster', cluster_name,
+                'osd', 'pool', 'application', 'enable',
+                'rbd', 'rbd', '--yes-i-really-mean-it'
+          ])
+    yield
+
+
+@contextlib.contextmanager
+def _bypass():
+    yield
+
+
+@contextlib.contextmanager
+def initialize_config(ctx, config):
+    cluster_name = config['cluster']
+    testdir = teuthology.get_testdir(ctx)
+
+    ctx.ceph[cluster_name].thrashers = []
+    # fixme: setup watchdog, ala ceph.py
+
+    ctx.ceph[cluster_name].roleless = False  # see below
+
+    first_ceph_cluster = False
+    if not hasattr(ctx, 'daemons'):
+        first_ceph_cluster = True
+
+    # cephadm mode?
+    if 'cephadm_mode' not in config:
+        config['cephadm_mode'] = 'root'
+    assert config['cephadm_mode'] in ['root', 'cephadm-package']
+    if config['cephadm_mode'] == 'root':
+        ctx.cephadm = testdir + '/cephadm'
+    else:
+        ctx.cephadm = 'cephadm'  # in the path
+
+    if first_ceph_cluster:
+        # FIXME: this is global for all clusters
+        ctx.daemons = DaemonGroup(
+            use_cephadm=ctx.cephadm)
+
+    # uuid
+    fsid = str(uuid.uuid1())
+    log.info('Cluster fsid is %s' % fsid)
+    ctx.ceph[cluster_name].fsid = fsid
+
+    # mon ips
+    log.info('Choosing monitor IPs and ports...')
+    remotes_and_roles = ctx.cluster.remotes.items()
+    ips = [host for (host, port) in
+           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
+
+    if config.get('roleless', False):
+        # mons will be named after hosts
+        first_mon = None
+        for remote, _ in remotes_and_roles:
+            ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
+            if not first_mon:
+                first_mon = remote.shortname
+                bootstrap_remote = remote
+        log.info('No mon roles; fabricating mons')
+
+    roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]
+
+    ctx.ceph[cluster_name].mons = get_mons(
+        roles, ips, cluster_name,
+        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
+        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
+    )
+    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
+
+    if config.get('roleless', False):
+        ctx.ceph[cluster_name].roleless = True
+        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
+        ctx.ceph[cluster_name].first_mon = first_mon
+        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
+    else:
+        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
+        _, _, first_mon = teuthology.split_role(first_mon_role)
+        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
+        log.info('First mon is mon.%s on %s' % (first_mon,
+                                                bootstrap_remote.shortname))
+        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
+        ctx.ceph[cluster_name].first_mon = first_mon
+        ctx.ceph[cluster_name].first_mon_role = first_mon_role
+
+        others = ctx.cluster.remotes[bootstrap_remote]
+        mgrs = sorted([r for r in others
+                       if teuthology.is_type('mgr', cluster_name)(r)])
+        if not mgrs:
+            raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
+        _, _, first_mgr = teuthology.split_role(mgrs[0])
+        log.info('First mgr is %s' % (first_mgr))
+        ctx.ceph[cluster_name].first_mgr = first_mgr
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy ceph cluster using cephadm
+
+    For example, teuthology.yaml can contain the 'defaults' section:
+
+        defaults:
+          cephadm:
+            containers:
+              image: 'quay.io/ceph-ci/ceph'
+
+    Using overrides makes it possible to customize it per run.
+    The equivalent 'overrides' section looks like:
+
+        overrides:
+          cephadm:
+            containers:
+              image: 'quay.io/ceph-ci/ceph'
+            registry-login:
+              url:  registry-url
+              username: registry-user
+              password: registry-password
+
+    :param ctx: the argparse.Namespace object
+    :param config: the config dict
+    """
+    if config is None:
+        config = {}
+
+    assert isinstance(config, dict), \
+        "task only supports a dictionary for configuration"
+
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('ceph', {}))
+    teuthology.deep_merge(config, overrides.get('cephadm', {}))
+    log.info('Config: ' + str(config))
+
+    # set up cluster context
+    if not hasattr(ctx, 'ceph'):
+        ctx.ceph = {}
+    if 'cluster' not in config:
+        config['cluster'] = 'ceph'
+    cluster_name = config['cluster']
+    if cluster_name not in ctx.ceph:
+        ctx.ceph[cluster_name] = argparse.Namespace()
+        ctx.ceph[cluster_name].bootstrapped = False
+
+    # image
+    teuth_defaults = teuth_config.get('defaults', {})
+    cephadm_defaults = teuth_defaults.get('cephadm', {})
+    containers_defaults = cephadm_defaults.get('containers', {})
+    container_image_name = containers_defaults.get('image', None)
+
+    containers = config.get('containers', {})
+    container_image_name = containers.get('image', container_image_name)
+
+    if not hasattr(ctx.ceph[cluster_name], 'image'):
+        ctx.ceph[cluster_name].image = config.get('image')
+    ref = None
+    if not ctx.ceph[cluster_name].image:
+        if not container_image_name:
+            raise Exception("Configuration error occurred. "
+                            "The 'image' value is undefined for 'cephadm' task. "
+                            "Please provide corresponding options in the task's "
+                            "config, task 'overrides', or teuthology 'defaults' "
+                            "section.")
+        sha1 = config.get('sha1')
+        flavor = config.get('flavor', 'default')
+
+        if sha1:
+            if flavor == "crimson":
+                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
+            else:
+                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
+            ref = sha1
+        else:
+            # hmm, fall back to branch?
+            branch = config.get('branch', 'master')
+            ref = branch
+            ctx.ceph[cluster_name].image = container_image_name + ':' + branch
+    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
+
+
+    with contextutil.nested(
+            #if the cluster is already bootstrapped bypass corresponding methods
+            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
+                              else initialize_config(ctx=ctx, config=config),
+            lambda: ceph_initial(),
+            lambda: normalize_hostnames(ctx=ctx),
+            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
+                              else download_cephadm(ctx=ctx, config=config, ref=ref),
+            lambda: ceph_log(ctx=ctx, config=config),
+            lambda: ceph_crash(ctx=ctx, config=config),
+            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
+                              else ceph_bootstrap(ctx, config),
+            lambda: crush_setup(ctx=ctx, config=config),
+            lambda: ceph_mons(ctx=ctx, config=config),
+            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
+            lambda: ceph_mgrs(ctx=ctx, config=config),
+            lambda: ceph_osds(ctx=ctx, config=config),
+            lambda: ceph_mdss(ctx=ctx, config=config),
+            lambda: ceph_rgw(ctx=ctx, config=config),
+            lambda: ceph_iscsi(ctx=ctx, config=config),
+            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
+            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
+            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
+            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
+            lambda: ceph_clients(ctx=ctx, config=config),
+            lambda: create_rbd_pool(ctx=ctx, config=config),
+    ):
+        if not hasattr(ctx, 'managers'):
+            ctx.managers = {}
+        ctx.managers[cluster_name] = CephManager(
+            ctx.ceph[cluster_name].bootstrap_remote,
+            ctx=ctx,
+            logger=log.getChild('ceph_manager.' + cluster_name),
+            cluster=cluster_name,
+            cephadm=True,
+        )
+
+        try:
+            if config.get('wait-for-healthy', True):
+                healthy(ctx=ctx, config=config)
+
+            log.info('Setup complete, yielding')
+            yield
+
+        finally:
+            log.info('Teardown begin')
+
diff --git a/qa/tasks/cephadm_cases/__init__.py b/qa/tasks/cephadm_cases/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/tasks/cephadm_cases/__init__.py
diff --git a/qa/tasks/cephadm_cases/test_cli.py b/qa/tasks/cephadm_cases/test_cli.py
new file mode 100644
index 000000000..c4cab4901
--- /dev/null
+++ b/qa/tasks/cephadm_cases/test_cli.py
@@ -0,0 +1,63 @@
+import logging
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestCephadmCLI(MgrTestCase):
+    def _cmd(self, *args) -> str:
+        assert self.mgr_cluster is not None
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd(*args)
+
+    def _orch_cmd(self, *args) -> str:
+        return self._cmd("orch", *args)
+
+    def setUp(self):
+        super(TestCephadmCLI, self).setUp()
+
+    def test_yaml(self):
+        """
+        to prevent oddities like
+
+        >>> import yaml
+        ... from collections import OrderedDict
+        ... assert yaml.dump(OrderedDict()) == '!!python/object/apply:collections.OrderedDict\\n- []\\n'
+        """
+        out = self._orch_cmd('device', 'ls', '--format', 'yaml')
+        self.assertNotIn('!!python', out)
+
+        out = self._orch_cmd('host', 'ls', '--format', 'yaml')
+        self.assertNotIn('!!python', out)
+
+        out = self._orch_cmd('ls', '--format', 'yaml')
+        self.assertNotIn('!!python', out)
+
+        out = self._orch_cmd('ps', '--format', 'yaml')
+        self.assertNotIn('!!python', out)
+
+        out = self._orch_cmd('status', '--format', 'yaml')
+        self.assertNotIn('!!python', out)
+
+    def test_pause(self):
+        self._orch_cmd('pause')
+        self.wait_for_health('CEPHADM_PAUSED', 60)
+        self._orch_cmd('resume')
+        self.wait_for_health_clear(60)
+
+    def test_daemon_restart(self):
+        self._orch_cmd('daemon', 'stop', 'osd.0')
+        self.wait_for_health('OSD_DOWN', 60)
+        self._orch_cmd('daemon', 'start', 'osd.0')
+        self.wait_for_health_clear(120)
+        self._orch_cmd('daemon', 'restart', 'osd.0')
+
+    def test_device_ls_wide(self):
+        self._orch_cmd('device', 'ls', '--wide')
+
+    def test_cephfs_mirror(self):
+        self._orch_cmd('apply', 'cephfs-mirror')
+        self.wait_until_true(lambda: 'cephfs-mirror' in self._orch_cmd('ps'), 60)
+        self.wait_for_health_clear(60)
+        self._orch_cmd('rm', 'cephfs-mirror')
+        self.wait_until_true(lambda: 'cephfs-mirror' not in self._orch_cmd('ps'), 60)
diff --git a/qa/tasks/cephadm_cases/test_cli_mon.py b/qa/tasks/cephadm_cases/test_cli_mon.py
new file mode 100644
index 000000000..72aee094e
--- /dev/null
+++ b/qa/tasks/cephadm_cases/test_cli_mon.py
@@ -0,0 +1,71 @@
+import json
+import logging
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestCephadmCLI(MgrTestCase):
+
+    APPLY_MON_PERIOD = 60
+
+    def _cmd(self, *args) -> str:
+        assert self.mgr_cluster is not None
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd(*args)
+
+    def _orch_cmd(self, *args) -> str:
+        return self._cmd("orch", *args)
+
+    def setUp(self):
+        super(TestCephadmCLI, self).setUp()
+
+    def _create_and_write_pool(self, pool_name):
+        # Create new pool and write to it, simulating a small workload.
+        self.mgr_cluster.mon_manager.create_pool(pool_name)
+        args = [
+            "rados", "-p", pool_name, "bench", "30", "write", "-t", "16"]
+        self.mgr_cluster.admin_remote.run(args=args, wait=True)
+    
+    def _get_quorum_size(self) -> int:
+        # Evaluate if the quorum size of the cluster is correct.
+        # log the quorum_status before reducing the monitors
+        retstr = self._cmd('quorum_status')
+        log.info("test_apply_mon._check_quorum_size: %s" % json.dumps(retstr, indent=2))
+        quorum_size = len(json.loads(retstr)['quorum']) # get quorum size
+        return quorum_size
+
+    def _check_no_crashes(self):
+        # Evaluate if there are no crashes
+        # log the crash
+        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'crash', 'ls',
+        )
+        log.info("test_apply_mon._check_no_crashes: %s" % retstr)
+        self.assertEqual(0, len(retstr)) # check if there are no crashes
+
+    def test_apply_mon_three(self):
+        # Evaluating the process of reducing the number of 
+        # monitors from 5 to 3 and increasing the number of
+        # monitors from 3 to 5, using the `ceph orch apply mon <num>` command.
+
+        self.wait_until_equal(lambda : self._get_quorum_size(), 5,
+                      timeout=self.APPLY_MON_PERIOD, period=10)
+
+        self._orch_cmd('apply', 'mon', '3') # reduce the monitors from 5 -> 3
+
+        self._create_and_write_pool('test_pool1')
+
+        self.wait_until_equal(lambda : self._get_quorum_size(), 3,
+                      timeout=self.APPLY_MON_PERIOD, period=10)
+
+        self._check_no_crashes()
+
+        self._orch_cmd('apply', 'mon', '5') # increase the monitors from 3 -> 5
+
+        self._create_and_write_pool('test_pool2')
+
+        self.wait_until_equal(lambda : self._get_quorum_size(), 5,
+                      timeout=self.APPLY_MON_PERIOD, period=10)
+
+        self._check_no_crashes()
+\ No newline at end of file
diff --git a/qa/tasks/cephfs/__init__.py b/qa/tasks/cephfs/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/tasks/cephfs/__init__.py
diff --git a/qa/tasks/cephfs/caps_helper.py b/qa/tasks/cephfs/caps_helper.py
new file mode 100644
index 000000000..39b5963be
--- /dev/null
+++ b/qa/tasks/cephfs/caps_helper.py
@@ -0,0 +1,79 @@
+"""
+Helper methods to test that MON and MDS caps are enforced properly.
+"""
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+from teuthology.orchestra.run import Raw
+
+class CapsHelper(CephFSTestCase):
+
+    def run_mon_cap_tests(self, moncap, keyring):
+        keyring_path = self.create_keyring_file(self.fs.admin_remote, keyring)
+
+        fsls = self.run_cluster_cmd(f'fs ls --id {self.client_id} -k '
+                                    f'{keyring_path}')
+
+        # we need to check only for default FS when fsname clause is absent
+        # in MON/MDS caps
+        if 'fsname' not in moncap:
+            self.assertIn(self.fs.name, fsls)
+            return
+
+        fss = (self.fs1.name, self.fs2.name) if hasattr(self, 'fs1') else \
+            (self.fs.name,)
+        for fsname in fss:
+                if fsname in moncap:
+                    self.assertIn('name: ' + fsname, fsls)
+                else:
+                    self.assertNotIn('name: ' + fsname, fsls)
+
+    def run_mds_cap_tests(self, filepaths, filedata, mounts, perm):
+        self.conduct_pos_test_for_read_caps(filepaths, filedata, mounts)
+
+        if perm == 'rw':
+            self.conduct_pos_test_for_write_caps(filepaths, mounts)
+        elif perm == 'r':
+            self.conduct_neg_test_for_write_caps(filepaths, mounts)
+        else:
+            raise RuntimeError(f'perm = {perm}\nIt should be "r" or "rw".')
+
+    def conduct_pos_test_for_read_caps(self, filepaths, filedata, mounts):
+        for mount in mounts:
+            for path, data in zip(filepaths, filedata):
+                # XXX: conduct tests only if path belongs to current mount; in
+                # teuth tests client are located on same machines.
+                if path.find(mount.hostfs_mntpt) != -1:
+                    contents = mount.read_file(path)
+                    self.assertEqual(data, contents)
+
+    def conduct_pos_test_for_write_caps(self, filepaths, mounts):
+        filedata = ('some new data on first fs', 'some new data on second fs')
+
+        for mount in mounts:
+            for path, data in zip(filepaths, filedata):
+                if path.find(mount.hostfs_mntpt) != -1:
+                    # test that write was successful
+                    mount.write_file(path=path, data=data)
+                    # verify that contents written was same as the one that was
+                    # intended
+                    contents1 = mount.read_file(path=path)
+                    self.assertEqual(data, contents1)
+
+    def conduct_neg_test_for_write_caps(self, filepaths, mounts):
+        cmdargs = ['echo', 'some random data', Raw('|'), 'tee']
+
+        for mount in mounts:
+            for path in filepaths:
+                if path.find(mount.hostfs_mntpt) != -1:
+                    cmdargs.append(path)
+                    mount.negtestcmd(args=cmdargs, retval=1,
+                                     errmsg='permission denied')
+
+    def get_mon_cap_from_keyring(self, client_name):
+        keyring = self.run_cluster_cmd(cmd=f'auth get {client_name}')
+        for line in keyring.split('\n'):
+            if 'caps mon' in line:
+                return line[line.find(' = "') + 4 : -1]
+
+        raise RuntimeError('get_save_mon_cap: mon cap not found in keyring. '
+                           'keyring -\n' + keyring)
diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py
new file mode 100644
index 000000000..41831dac6
--- /dev/null
+++ b/qa/tasks/cephfs/cephfs_test_case.py
@@ -0,0 +1,462 @@
+import json
+import logging
+import os
+import re
+
+from shlex import split as shlex_split
+
+from tasks.ceph_test_case import CephTestCase
+
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+def for_teuthology(f):
+    """
+    Decorator that adds an "is_for_teuthology" attribute to the wrapped function
+    """
+    f.is_for_teuthology = True
+    return f
+
+
+def needs_trimming(f):
+    """
+    Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse
+    this means it needs to be able to run as root, currently)
+    """
+    f.needs_trimming = True
+    return f
+
+
+class MountDetails():
+
+    def __init__(self, mntobj):
+        self.client_id = mntobj.client_id
+        self.client_keyring_path = mntobj.client_keyring_path
+        self.client_remote = mntobj.client_remote
+        self.cephfs_name = mntobj.cephfs_name
+        self.cephfs_mntpt = mntobj.cephfs_mntpt
+        self.hostfs_mntpt = mntobj.hostfs_mntpt
+
+    def restore(self, mntobj):
+        mntobj.client_id = self.client_id
+        mntobj.client_keyring_path = self.client_keyring_path
+        mntobj.client_remote = self.client_remote
+        mntobj.cephfs_name = self.cephfs_name
+        mntobj.cephfs_mntpt = self.cephfs_mntpt
+        mntobj.hostfs_mntpt = self.hostfs_mntpt
+
+
+class CephFSTestCase(CephTestCase):
+    """
+    Test case for Ceph FS, requires caller to populate Filesystem and Mounts,
+    into the fs, mount_a, mount_b class attributes (setting mount_b is optional)
+
+    Handles resetting the cluster under test between tests.
+    """
+
+    # FIXME weird explicit naming
+    mount_a = None
+    mount_b = None
+    recovery_mount = None
+
+    # Declarative test requirements: subclasses should override these to indicate
+    # their special needs.  If not met, tests will be skipped.
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+    REQUIRE_KCLIENT_REMOTE = False
+    REQUIRE_ONE_CLIENT_REMOTE = False
+
+    # Whether to create the default filesystem during setUp
+    REQUIRE_FILESYSTEM = True
+
+    # requires REQUIRE_FILESYSTEM = True
+    REQUIRE_RECOVERY_FILESYSTEM = False
+
+    # create a backup filesystem if required.
+    # required REQUIRE_FILESYSTEM enabled
+    REQUIRE_BACKUP_FILESYSTEM = False
+
+    LOAD_SETTINGS = [] # type: ignore
+
+    def _save_mount_details(self):
+        """
+        XXX: Tests may change details of mount objects, so let's stash them so
+        that these details are restored later to ensure smooth setUps and
+        tearDowns for upcoming tests.
+        """
+        self._orig_mount_details = [MountDetails(m) for m in self.mounts]
+        log.info(self._orig_mount_details)
+
+    def _remove_blocklist(self):
+        # In case anything is in the OSD blocklist list, clear it out.  This is to avoid
+        # the OSD map changing in the background (due to blocklist expiry) while tests run.
+        try:
+            self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blocklist", "clear")
+        except CommandFailedError:
+            # Fallback for older Ceph cluster
+            try:
+                blocklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
+                                      "dump", "--format=json-pretty"))['blocklist']
+                log.info(f"Removing {len(blocklist)} blocklist entries")
+                for addr, blocklisted_at in blocklist.items():
+                    self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blocklist", "rm", addr)
+            except KeyError:
+                # Fallback for more older Ceph clusters, who will use 'blacklist' instead.
+                blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
+                                      "dump", "--format=json-pretty"))['blacklist']
+                log.info(f"Removing {len(blacklist)} blacklist entries")
+                for addr, blocklisted_at in blacklist.items():
+                    self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)
+
+    def setUp(self):
+        super(CephFSTestCase, self).setUp()
+
+        self.config_set('mon', 'mon_allow_pool_delete', True)
+
+        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
+            self.skipTest("Only have {0} MDSs, require {1}".format(
+                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
+            ))
+
+        if len(self.mounts) < self.CLIENTS_REQUIRED:
+            self.skipTest("Only have {0} clients, require {1}".format(
+                len(self.mounts), self.CLIENTS_REQUIRED
+            ))
+
+        if self.REQUIRE_ONE_CLIENT_REMOTE:
+            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
+                self.skipTest("Require first client to be on separate server from MDSs")
+
+        # Create friendly mount_a, mount_b attrs
+        for i in range(0, self.CLIENTS_REQUIRED):
+            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])
+
+        self.mds_cluster.clear_firewall()
+
+        # Unmount all clients, we are about to blow away the filesystem
+        for mount in self.mounts:
+            if mount.is_mounted():
+                mount.umount_wait(force=True)
+        self._save_mount_details()
+
+        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
+        # the filesystem rather than just doing a rm -rf of files
+        self.mds_cluster.delete_all_filesystems()
+        self.mds_cluster.mds_restart() # to reset any run-time configs, etc.
+        self.fs = None # is now invalid!
+        self.backup_fs = None
+        self.recovery_fs = None
+
+        self._remove_blocklist()
+
+        client_mount_ids = [m.client_id for m in self.mounts]
+        # In case there were any extra auth identities around from a previous
+        # test, delete them
+        for entry in self.auth_list():
+            ent_type, ent_id = entry['entity'].split(".")
+            if ent_type == "client" and ent_id not in client_mount_ids and not (ent_id == "admin" or ent_id[:6] == 'mirror'):
+                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])
+
+        if self.REQUIRE_FILESYSTEM:
+            self.fs = self.mds_cluster.newfs(create=True)
+
+            # In case some test messed with auth caps, reset them
+            for client_id in client_mount_ids:
+                cmd = ['auth', 'caps', f'client.{client_id}', 'mon','allow r',
+                       'osd', f'allow rw pool={self.fs.get_data_pool_name()}',
+                       'mds', 'allow']
+
+                if self.run_cluster_cmd_result(cmd) == 0:
+                    break
+
+                cmd[1] = 'add'
+                if self.run_cluster_cmd_result(cmd) != 0:
+                    raise RuntimeError(f'Failed to create new client {cmd[2]}')
+
+            # wait for ranks to become active
+            self.fs.wait_for_daemons()
+
+            # Mount the requested number of clients
+            for i in range(0, self.CLIENTS_REQUIRED):
+                self.mounts[i].mount_wait()
+
+        if self.REQUIRE_BACKUP_FILESYSTEM:
+            if not self.REQUIRE_FILESYSTEM:
+                self.skipTest("backup filesystem requires a primary filesystem as well")
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
+                                                'enable_multiple', 'true',
+                                                '--yes-i-really-mean-it')
+            self.backup_fs = self.mds_cluster.newfs(name="backup_fs")
+            self.backup_fs.wait_for_daemons()
+
+        if self.REQUIRE_RECOVERY_FILESYSTEM:
+            if not self.REQUIRE_FILESYSTEM:
+                self.skipTest("Recovery filesystem requires a primary filesystem as well")
+            # After Octopus is EOL, we can remove this setting:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
+                                                'enable_multiple', 'true',
+                                                '--yes-i-really-mean-it')
+            self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
+            self.recovery_fs.set_metadata_overlay(True)
+            self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
+            self.recovery_fs.create()
+            self.recovery_fs.getinfo(refresh=True)
+            self.recovery_fs.wait_for_daemons()
+
+        # Load an config settings of interest
+        for setting in self.LOAD_SETTINGS:
+            setattr(self, setting, float(self.fs.mds_asok(
+                ['config', 'get', setting], list(self.mds_cluster.mds_ids)[0]
+            )[setting]))
+
+        self.configs_set = set()
+
+    def tearDown(self):
+        self.mds_cluster.clear_firewall()
+        for m in self.mounts:
+            m.teardown()
+
+        # To prevent failover messages during Unwind of ceph task
+        self.mds_cluster.delete_all_filesystems()
+
+        for m, md in zip(self.mounts, self._orig_mount_details):
+            md.restore(m)
+
+        for subsys, key in self.configs_set:
+            self.mds_cluster.clear_ceph_conf(subsys, key)
+
+        return super(CephFSTestCase, self).tearDown()
+
+    def set_conf(self, subsys, key, value):
+        self.configs_set.add((subsys, key))
+        self.mds_cluster.set_ceph_conf(subsys, key, value)
+
+    def auth_list(self):
+        """
+        Convenience wrapper on "ceph auth ls"
+        """
+        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
+            "auth", "ls", "--format=json-pretty"
+        ))['auth_dump']
+
+    def assert_session_count(self, expected, ls_data=None, mds_id=None):
+        if ls_data is None:
+            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)
+
+        alive_count = len([s for s in ls_data if s['state'] != 'killing'])
+
+        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
+            expected, alive_count
+        ))
+
+    def assert_session_state(self, client_id,  expected_state):
+        self.assertEqual(
+            self._session_by_id(
+                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
+            expected_state)
+
+    def get_session_data(self, client_id):
+        return self._session_by_id(client_id)
+
+    def _session_list(self):
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
+        return ls_data
+
+    def get_session(self, client_id, session_ls=None):
+        if session_ls is None:
+            session_ls = self.fs.mds_asok(['session', 'ls'])
+
+        return self._session_by_id(session_ls)[client_id]
+
+    def _session_by_id(self, session_ls):
+        return dict([(s['id'], s) for s in session_ls])
+
+    def perf_dump(self, rank=None, status=None):
+        return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)
+
+    def wait_until_evicted(self, client_id, timeout=30):
+        def is_client_evicted():
+            ls = self._session_list()
+            for s in ls:
+                if s['id'] == client_id:
+                    return False
+            return True
+        self.wait_until_true(is_client_evicted, timeout)
+
+    def wait_for_daemon_start(self, daemon_ids=None):
+        """
+        Wait until all the daemons appear in the FSMap, either assigned
+        MDS ranks or in the list of standbys
+        """
+        def get_daemon_names():
+            return [info['name'] for info in self.mds_cluster.status().get_all()]
+
+        if daemon_ids is None:
+            daemon_ids = self.mds_cluster.mds_ids
+
+        try:
+            self.wait_until_true(
+                lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
+                timeout=30
+            )
+        except RuntimeError:
+            log.warning("Timeout waiting for daemons {0}, while we have {1}".format(
+                daemon_ids, get_daemon_names()
+            ))
+            raise
+
+    def delete_mds_coredump(self, daemon_id):
+        # delete coredump file, otherwise teuthology.internal.coredump will
+        # catch it later and treat it as a failure.
+        core_pattern = self.mds_cluster.mds_daemons[daemon_id].remote.sh(
+            "sudo sysctl -n kernel.core_pattern")
+        core_dir = os.path.dirname(core_pattern.strip())
+        if core_dir:  # Non-default core_pattern with a directory in it
+            # We have seen a core_pattern that looks like it's from teuthology's coredump
+            # task, so proceed to clear out the core file
+            if core_dir[0] == '|':
+                log.info("Piped core dumps to program {0}, skip cleaning".format(core_dir[1:]))
+                return;
+
+            log.info("Clearing core from directory: {0}".format(core_dir))
+
+            # Verify that we see the expected single coredump
+            ls_output = self.mds_cluster.mds_daemons[daemon_id].remote.sh([
+                "cd", core_dir, run.Raw('&&'),
+                "sudo", "ls", run.Raw('|'), "sudo", "xargs", "file"
+            ])
+            cores = [l.partition(":")[0]
+                     for l in ls_output.strip().split("\n")
+                     if re.match(r'.*ceph-mds.* -i +{0}'.format(daemon_id), l)]
+
+            log.info("Enumerated cores: {0}".format(cores))
+            self.assertEqual(len(cores), 1)
+
+            log.info("Found core file {0}, deleting it".format(cores[0]))
+
+            self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+                "cd", core_dir, run.Raw('&&'), "sudo", "rm", "-f", cores[0]
+            ])
+        else:
+            log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
+
+    def _get_subtrees(self, status=None, rank=None, path=None):
+        if path is None:
+            path = "/"
+        try:
+            with contextutil.safe_while(sleep=1, tries=3) as proceed:
+                while proceed():
+                    try:
+                        if rank == "all":
+                            subtrees = []
+                            for r in self.fs.get_ranks(status=status):
+                                s = self.fs.rank_asok(["get", "subtrees"], status=status, rank=r['rank'])
+                                s = filter(lambda s: s['auth_first'] == r['rank'] and s['auth_second'] == -2, s)
+                                subtrees += s
+                        else:
+                            subtrees = self.fs.rank_asok(["get", "subtrees"], status=status, rank=rank)
+                        subtrees = filter(lambda s: s['dir']['path'].startswith(path), subtrees)
+                        return list(subtrees)
+                    except CommandFailedError as e:
+                        # Sometimes we get transient errors
+                        if e.exitstatus == 22:
+                            pass
+                        else:
+                            raise
+        except contextutil.MaxWhileTries as e:
+            raise RuntimeError(f"could not get subtree state from rank {rank}") from e
+
+    def _wait_subtrees(self, test, status=None, rank=None, timeout=30, sleep=2, action=None, path=None):
+        test = sorted(test)
+        try:
+            with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
+                while proceed():
+                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
+                    filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
+                    log.info("%s =?= %s", filtered, test)
+                    if filtered == test:
+                        # Confirm export_pin in output is correct:
+                        for s in subtrees:
+                            if s['export_pin_target'] >= 0:
+                                self.assertTrue(s['export_pin_target'] == s['auth_first'])
+                        return subtrees
+                    if action is not None:
+                        action()
+        except contextutil.MaxWhileTries as e:
+            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
+
+    def _wait_until_scrub_complete(self, path="/", recursive=True, timeout=100):
+        out_json = self.fs.run_scrub(["start", path] + ["recursive"] if recursive else [])
+        if not self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"],
+                                                 sleep=10, timeout=timeout):
+            log.info("timed out waiting for scrub to complete")
+
+    def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None):
+        try:
+            with contextutil.safe_while(sleep=5, tries=20) as proceed:
+                while proceed():
+                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
+                    subtrees = list(filter(lambda s: s['distributed_ephemeral_pin'] == True and
+                                                     s['auth_first'] == s['export_pin_target'],
+                                           subtrees))
+                    log.info(f"len={len(subtrees)} {subtrees}")
+                    if len(subtrees) >= count:
+                        return subtrees
+        except contextutil.MaxWhileTries as e:
+            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
+
+    def _wait_random_subtrees(self, count, status=None, rank=None, path=None):
+        try:
+            with contextutil.safe_while(sleep=5, tries=20) as proceed:
+                while proceed():
+                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
+                    subtrees = list(filter(lambda s: s['random_ephemeral_pin'] == True and
+                                                     s['auth_first'] == s['export_pin_target'],
+                                           subtrees))
+                    log.info(f"len={len(subtrees)} {subtrees}")
+                    if len(subtrees) >= count:
+                        return subtrees
+        except contextutil.MaxWhileTries as e:
+            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
+
+    def run_cluster_cmd(self, cmd):
+        if isinstance(cmd, str):
+            cmd = shlex_split(cmd)
+        return self.fs.mon_manager.raw_cluster_cmd(*cmd)
+
+    def run_cluster_cmd_result(self, cmd):
+        if isinstance(cmd, str):
+            cmd = shlex_split(cmd)
+        return self.fs.mon_manager.raw_cluster_cmd_result(*cmd)
+
+    def create_client(self, client_id, moncap=None, osdcap=None, mdscap=None):
+        if not (moncap or osdcap or mdscap):
+            if self.fs:
+                return self.fs.authorize(client_id, ('/', 'rw'))
+            else:
+                raise RuntimeError('no caps were passed and the default FS '
+                                   'is not created yet to allow client auth '
+                                   'for it.')
+
+        cmd = ['auth', 'add', f'client.{client_id}']
+        if moncap:
+            cmd += ['mon', moncap]
+        if osdcap:
+            cmd += ['osd', osdcap]
+        if mdscap:
+            cmd += ['mds', mdscap]
+
+        self.run_cluster_cmd(cmd)
+        return self.run_cluster_cmd(f'auth get {self.client_name}')
+
+    def create_keyring_file(self, remote, keyring):
+        keyring_path = remote.mktemp(data=keyring)
+
+        # required when triggered using vstart_runner.py.
+        remote.run(args=['chmod', '644', keyring_path])
+
+        return keyring_path
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
new file mode 100644
index 000000000..35b80106d
--- /dev/null
+++ b/qa/tasks/cephfs/filesystem.py
@@ -0,0 +1,1653 @@
+
+import json
+import logging
+from gevent import Greenlet
+import os
+import time
+import datetime
+import re
+import errno
+import random
+import traceback
+
+from io import BytesIO, StringIO
+from errno import EBUSY
+
+from teuthology.exceptions import CommandFailedError
+from teuthology import misc
+from teuthology.nuke import clear_firewall
+from teuthology.parallel import parallel
+from teuthology import contextutil
+from tasks.ceph_manager import write_conf
+from tasks import ceph_manager
+
+
+log = logging.getLogger(__name__)
+
+
+DAEMON_WAIT_TIMEOUT = 120
+ROOT_INO = 1
+
+class FileLayout(object):
+    def __init__(self, pool=None, pool_namespace=None, stripe_unit=None, stripe_count=None, object_size=None):
+        self.pool = pool
+        self.pool_namespace = pool_namespace
+        self.stripe_unit = stripe_unit
+        self.stripe_count = stripe_count
+        self.object_size = object_size
+
+    @classmethod
+    def load_from_ceph(layout_str):
+        # TODO
+        pass
+
+    def items(self):
+        if self.pool is not None:
+            yield ("pool", self.pool)
+        if self.pool_namespace:
+            yield ("pool_namespace", self.pool_namespace)
+        if self.stripe_unit is not None:
+            yield ("stripe_unit", self.stripe_unit)
+        if self.stripe_count is not None:
+            yield ("stripe_count", self.stripe_count)
+        if self.object_size is not None:
+            yield ("object_size", self.stripe_size)
+
+class ObjectNotFound(Exception):
+    def __init__(self, object_name):
+        self._object_name = object_name
+
+    def __str__(self):
+        return "Object not found: '{0}'".format(self._object_name)
+
+class FSMissing(Exception):
+    def __init__(self, ident):
+        self.ident = ident
+
+    def __str__(self):
+        return f"File system {self.ident} does not exist in the map"
+
+class FSStatus(object):
+    """
+    Operations on a snapshot of the FSMap.
+    """
+    def __init__(self, mon_manager, epoch=None):
+        self.mon = mon_manager
+        cmd = ["fs", "dump", "--format=json"]
+        if epoch is not None:
+            cmd.append(str(epoch))
+        self.map = json.loads(self.mon.raw_cluster_cmd(*cmd))
+
+    def __str__(self):
+        return json.dumps(self.map, indent = 2, sort_keys = True)
+
+    # Expose the fsmap for manual inspection.
+    def __getitem__(self, key):
+        """
+        Get a field from the fsmap.
+        """
+        return self.map[key]
+
+    def get_filesystems(self):
+        """
+        Iterator for all filesystems.
+        """
+        for fs in self.map['filesystems']:
+            yield fs
+
+    def get_all(self):
+        """
+        Iterator for all the mds_info components in the FSMap.
+        """
+        for info in self.map['standbys']:
+            yield info
+        for fs in self.map['filesystems']:
+            for info in fs['mdsmap']['info'].values():
+                yield info
+
+    def get_standbys(self):
+        """
+        Iterator for all standbys.
+        """
+        for info in self.map['standbys']:
+            yield info
+
+    def get_fsmap(self, fscid):
+        """
+        Get the fsmap for the given FSCID.
+        """
+        for fs in self.map['filesystems']:
+            if fscid is None or fs['id'] == fscid:
+                return fs
+        raise FSMissing(fscid)
+
+    def get_fsmap_byname(self, name):
+        """
+        Get the fsmap for the given file system name.
+        """
+        for fs in self.map['filesystems']:
+            if name is None or fs['mdsmap']['fs_name'] == name:
+                return fs
+        raise FSMissing(name)
+
+    def get_replays(self, fscid):
+        """
+        Get the standby:replay MDS for the given FSCID.
+        """
+        fs = self.get_fsmap(fscid)
+        for info in fs['mdsmap']['info'].values():
+            if info['state'] == 'up:standby-replay':
+                yield info
+
+    def get_ranks(self, fscid):
+        """
+        Get the ranks for the given FSCID.
+        """
+        fs = self.get_fsmap(fscid)
+        for info in fs['mdsmap']['info'].values():
+            if info['rank'] >= 0 and info['state'] != 'up:standby-replay':
+                yield info
+
+    def get_damaged(self, fscid):
+        """
+        Get the damaged ranks for the given FSCID.
+        """
+        fs = self.get_fsmap(fscid)
+        return fs['mdsmap']['damaged']
+
+    def get_rank(self, fscid, rank):
+        """
+        Get the rank for the given FSCID.
+        """
+        for info in self.get_ranks(fscid):
+            if info['rank'] == rank:
+                return info
+        raise RuntimeError("FSCID {0} has no rank {1}".format(fscid, rank))
+
+    def get_mds(self, name):
+        """
+        Get the info for the given MDS name.
+        """
+        for info in self.get_all():
+            if info['name'] == name:
+                return info
+        return None
+
+    def get_mds_addr(self, name):
+        """
+        Return the instance addr as a string, like "10.214.133.138:6807\/10825"
+        """
+        info = self.get_mds(name)
+        if info:
+            return info['addr']
+        else:
+            log.warning(json.dumps(list(self.get_all()), indent=2))  # dump for debugging
+            raise RuntimeError("MDS id '{0}' not found in map".format(name))
+
+    def get_mds_addrs(self, name):
+        """
+        Return the instance addr as a string, like "[10.214.133.138:6807 10.214.133.138:6808]"
+        """
+        info = self.get_mds(name)
+        if info:
+            return [e['addr'] for e in info['addrs']['addrvec']]
+        else:
+            log.warn(json.dumps(list(self.get_all()), indent=2))  # dump for debugging
+            raise RuntimeError("MDS id '{0}' not found in map".format(name))
+
+    def get_mds_gid(self, gid):
+        """
+        Get the info for the given MDS gid.
+        """
+        for info in self.get_all():
+            if info['gid'] == gid:
+                return info
+        return None
+
+    def hadfailover(self, status):
+        """
+        Compares two statuses for mds failovers.
+        Returns True if there is a failover.
+        """
+        for fs in status.map['filesystems']:
+            for info in fs['mdsmap']['info'].values():
+                oldinfo = self.get_mds_gid(info['gid'])
+                if oldinfo is None or oldinfo['incarnation'] != info['incarnation']:
+                    return True
+        #all matching
+        return False
+
+class CephCluster(object):
+    @property
+    def admin_remote(self):
+        first_mon = misc.get_first_mon(self._ctx, None)
+        (result,) = self._ctx.cluster.only(first_mon).remotes.keys()
+        return result
+
+    def __init__(self, ctx) -> None:
+        self._ctx = ctx
+        self.mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=ctx, logger=log.getChild('ceph_manager'))
+
+    def get_config(self, key, service_type=None):
+        """
+        Get config from mon by default, or a specific service if caller asks for it
+        """
+        if service_type is None:
+            service_type = 'mon'
+
+        service_id = sorted(misc.all_roles_of_type(self._ctx.cluster, service_type))[0]
+        return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+    def set_ceph_conf(self, subsys, key, value):
+        if subsys not in self._ctx.ceph['ceph'].conf:
+            self._ctx.ceph['ceph'].conf[subsys] = {}
+        self._ctx.ceph['ceph'].conf[subsys][key] = value
+        write_conf(self._ctx)  # XXX because we don't have the ceph task's config object, if they
+                               # used a different config path this won't work.
+
+    def clear_ceph_conf(self, subsys, key):
+        del self._ctx.ceph['ceph'].conf[subsys][key]
+        write_conf(self._ctx)
+
+    def json_asok(self, command, service_type, service_id, timeout=None):
+        if timeout is None:
+            timeout = 15*60
+        command.insert(0, '--format=json')
+        proc = self.mon_manager.admin_socket(service_type, service_id, command, timeout=timeout)
+        response_data = proc.stdout.getvalue().strip()
+        if len(response_data) > 0:
+            j = json.loads(response_data)
+            pretty = json.dumps(j, sort_keys=True, indent=2)
+            log.debug(f"_json_asok output\n{pretty}")
+            return j
+        else:
+            log.debug("_json_asok output empty")
+            return None
+
+    def is_addr_blocklisted(self, addr):
+        blocklist = json.loads(self.mon_manager.raw_cluster_cmd(
+            "osd", "dump", "--format=json"))['blocklist']
+        if addr in blocklist:
+            return True
+        log.warn(f'The address {addr} is not blocklisted')
+        return False
+
+
+class MDSCluster(CephCluster):
+    """
+    Collective operations on all the MDS daemons in the Ceph cluster.  These
+    daemons may be in use by various Filesystems.
+
+    For the benefit of pre-multi-filesystem tests, this class is also
+    a parent of Filesystem.  The correct way to use MDSCluster going forward is
+    as a separate instance outside of your (multiple) Filesystem instances.
+    """
+
+    def __init__(self, ctx):
+        super(MDSCluster, self).__init__(ctx)
+
+    @property
+    def mds_ids(self):
+        # do this dynamically because the list of ids may change periodically with cephadm
+        return list(misc.all_roles_of_type(self._ctx.cluster, 'mds'))
+
+    @property
+    def mds_daemons(self):
+        return dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids])
+
+    def _one_or_all(self, mds_id, cb, in_parallel=True):
+        """
+        Call a callback for a single named MDS, or for all.
+
+        Note that the parallelism here isn't for performance, it's to avoid being overly kind
+        to the cluster by waiting a graceful ssh-latency of time between doing things, and to
+        avoid being overly kind by executing them in a particular order.  However, some actions
+        don't cope with being done in parallel, so it's optional (`in_parallel`)
+
+        :param mds_id: MDS daemon name, or None
+        :param cb: Callback taking single argument of MDS daemon name
+        :param in_parallel: whether to invoke callbacks concurrently (else one after the other)
+        """
+
+        if mds_id is None:
+            if in_parallel:
+                with parallel() as p:
+                    for mds_id in self.mds_ids:
+                        p.spawn(cb, mds_id)
+            else:
+                for mds_id in self.mds_ids:
+                    cb(mds_id)
+        else:
+            cb(mds_id)
+
+    def get_config(self, key, service_type=None):
+        """
+        get_config specialization of service_type="mds"
+        """
+        if service_type != "mds":
+            return super(MDSCluster, self).get_config(key, service_type)
+
+        # Some tests stop MDS daemons, don't send commands to a dead one:
+        running_daemons = [i for i, mds in self.mds_daemons.items() if mds.running()]
+        service_id = random.sample(running_daemons, 1)[0]
+        return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+    def mds_stop(self, mds_id=None):
+        """
+        Stop the MDS daemon process(se).  If it held a rank, that rank
+        will eventually go laggy.
+        """
+        self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop())
+
+    def mds_fail(self, mds_id=None):
+        """
+        Inform MDSMonitor of the death of the daemon process(es).  If it held
+        a rank, that rank will be relinquished.
+        """
+        self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_))
+
+    def mds_restart(self, mds_id=None):
+        self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart())
+
+    def mds_fail_restart(self, mds_id=None):
+        """
+        Variation on restart that includes marking MDSs as failed, so that doing this
+        operation followed by waiting for healthy daemon states guarantees that they
+        have gone down and come up, rather than potentially seeing the healthy states
+        that existed before the restart.
+        """
+        def _fail_restart(id_):
+            self.mds_daemons[id_].stop()
+            self.mon_manager.raw_cluster_cmd("mds", "fail", id_)
+            self.mds_daemons[id_].restart()
+
+        self._one_or_all(mds_id, _fail_restart)
+
+    def mds_signal(self, mds_id, sig, silent=False):
+        """
+        signal a MDS daemon
+        """
+        self.mds_daemons[mds_id].signal(sig, silent);
+
+    def newfs(self, name='cephfs', create=True):
+        return Filesystem(self._ctx, name=name, create=create)
+
+    def status(self, epoch=None):
+        return FSStatus(self.mon_manager, epoch)
+
+    def get_standby_daemons(self):
+        return set([s['name'] for s in self.status().get_standbys()])
+
+    def get_mds_hostnames(self):
+        result = set()
+        for mds_id in self.mds_ids:
+            mds_remote = self.mon_manager.find_remote('mds', mds_id)
+            result.add(mds_remote.hostname)
+
+        return list(result)
+
+    def set_clients_block(self, blocked, mds_id=None):
+        """
+        Block (using iptables) client communications to this MDS.  Be careful: if
+        other services are running on this MDS, or other MDSs try to talk to this
+        MDS, their communications may also be blocked as collatoral damage.
+
+        :param mds_id: Optional ID of MDS to block, default to all
+        :return:
+        """
+        da_flag = "-A" if blocked else "-D"
+
+        def set_block(_mds_id):
+            remote = self.mon_manager.find_remote('mds', _mds_id)
+            status = self.status()
+
+            addr = status.get_mds_addr(_mds_id)
+            ip_str, port_str, inst_str = re.match("(.+):(.+)/(.+)", addr).groups()
+
+            remote.run(
+                args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m",
+                      "comment", "--comment", "teuthology"])
+            remote.run(
+                args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m",
+                      "comment", "--comment", "teuthology"])
+
+        self._one_or_all(mds_id, set_block, in_parallel=False)
+
+    def set_inter_mds_block(self, blocked, mds_rank_1, mds_rank_2):
+        """
+        Block (using iptables) communications from a provided MDS to other MDSs.
+        Block all ports that an MDS uses for communication.
+
+        :param blocked: True to block the MDS, False otherwise
+        :param mds_rank_1: MDS rank
+        :param mds_rank_2: MDS rank
+        :return:
+        """
+        da_flag = "-A" if blocked else "-D"
+
+        def set_block(mds_ids):
+            status = self.status()
+
+            mds = mds_ids[0]
+            remote = self.mon_manager.find_remote('mds', mds)
+            addrs = status.get_mds_addrs(mds)
+            for addr in addrs:
+                ip_str, port_str = re.match("(.+):(.+)", addr).groups()
+                remote.run(
+                    args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m",
+                          "comment", "--comment", "teuthology"])
+
+
+            mds = mds_ids[1]
+            remote = self.mon_manager.find_remote('mds', mds)
+            addrs = status.get_mds_addrs(mds)
+            for addr in addrs:
+                ip_str, port_str = re.match("(.+):(.+)", addr).groups()
+                remote.run(
+                    args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m",
+                          "comment", "--comment", "teuthology"])
+                remote.run(
+                    args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m",
+                          "comment", "--comment", "teuthology"])
+
+        self._one_or_all((mds_rank_1, mds_rank_2), set_block, in_parallel=False)
+
+    def clear_firewall(self):
+        clear_firewall(self._ctx)
+
+    def get_mds_info(self, mds_id):
+        return FSStatus(self.mon_manager).get_mds(mds_id)
+
+    def is_pool_full(self, pool_name):
+        pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+        for pool in pools:
+            if pool['pool_name'] == pool_name:
+                return 'full' in pool['flags_names'].split(",")
+
+        raise RuntimeError("Pool not found '{0}'".format(pool_name))
+
+    def delete_all_filesystems(self):
+        """
+        Remove all filesystems that exist, and any pools in use by them.
+        """
+        for fs in self.status().get_filesystems():
+            Filesystem(ctx=self._ctx, fscid=fs['id']).destroy()
+
+
+class Filesystem(MDSCluster):
+    """
+    This object is for driving a CephFS filesystem.  The MDS daemons driven by
+    MDSCluster may be shared with other Filesystems.
+    """
+    def __init__(self, ctx, fs_config={}, fscid=None, name=None, create=False):
+        super(Filesystem, self).__init__(ctx)
+
+        self.name = name
+        self.id = None
+        self.metadata_pool_name = None
+        self.metadata_overlay = False
+        self.data_pool_name = None
+        self.data_pools = None
+        self.fs_config = fs_config
+        self.ec_profile = fs_config.get('ec_profile')
+
+        client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
+        self.client_id = client_list[0]
+        self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1]
+
+        if name is not None:
+            if fscid is not None:
+                raise RuntimeError("cannot specify fscid when creating fs")
+            if create and not self.legacy_configured():
+                self.create()
+        else:
+            if fscid is not None:
+                self.id = fscid
+                self.getinfo(refresh = True)
+
+        # Stash a reference to the first created filesystem on ctx, so
+        # that if someone drops to the interactive shell they can easily
+        # poke our methods.
+        if not hasattr(self._ctx, "filesystem"):
+            self._ctx.filesystem = self
+
+    def dead(self):
+        try:
+            return not bool(self.get_mds_map())
+        except FSMissing:
+            return True
+
+    def get_task_status(self, status_key):
+        return self.mon_manager.get_service_task_status("mds", status_key)
+
+    def getinfo(self, refresh = False):
+        status = self.status()
+        if self.id is not None:
+            fsmap = status.get_fsmap(self.id)
+        elif self.name is not None:
+            fsmap = status.get_fsmap_byname(self.name)
+        else:
+            fss = [fs for fs in status.get_filesystems()]
+            if len(fss) == 1:
+                fsmap = fss[0]
+            elif len(fss) == 0:
+                raise RuntimeError("no file system available")
+            else:
+                raise RuntimeError("more than one file system available")
+        self.id = fsmap['id']
+        self.name = fsmap['mdsmap']['fs_name']
+        self.get_pool_names(status = status, refresh = refresh)
+        return status
+
+    def set_metadata_overlay(self, overlay):
+        if self.id is not None:
+            raise RuntimeError("cannot specify fscid when configuring overlay")
+        self.metadata_overlay = overlay
+
+    def deactivate(self, rank):
+        if rank < 0:
+            raise RuntimeError("invalid rank")
+        elif rank == 0:
+            raise RuntimeError("cannot deactivate rank 0")
+        self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
+
+    def reach_max_mds(self):
+        # Try to reach rank count == max_mds, up or down (UPGRADE SENSITIVE!)
+        status = self.getinfo()
+        mds_map = self.get_mds_map(status=status)
+        max_mds = mds_map['max_mds']
+
+        count = len(list(self.get_ranks(status=status)))
+        if count > max_mds:
+            try:
+                # deactivate mds in decending order
+                status = self.wait_for_daemons(status=status, skip_max_mds_check=True)
+                while count > max_mds:
+                    targets = sorted(self.get_ranks(status=status), key=lambda r: r['rank'], reverse=True)
+                    target = targets[0]
+                    log.debug("deactivating rank %d" % target['rank'])
+                    self.deactivate(target['rank'])
+                    status = self.wait_for_daemons(skip_max_mds_check=True)
+                    count = len(list(self.get_ranks(status=status)))
+            except:
+                # In Mimic, deactivation is done automatically:
+                log.info("Error:\n{}".format(traceback.format_exc()))
+                status = self.wait_for_daemons()
+        else:
+            status = self.wait_for_daemons()
+
+        mds_map = self.get_mds_map(status=status)
+        assert(mds_map['max_mds'] == max_mds)
+        assert(mds_map['in'] == list(range(0, max_mds)))
+
+    def reset(self):
+        self.mon_manager.raw_cluster_cmd("fs", "reset", str(self.name), '--yes-i-really-mean-it')
+
+    def fail(self):
+        self.mon_manager.raw_cluster_cmd("fs", "fail", str(self.name))
+
+    def set_flag(self, var, *args):
+        a = map(lambda x: str(x).lower(), args)
+        self.mon_manager.raw_cluster_cmd("fs", "flag", "set", var, *a)
+
+    def set_allow_multifs(self, yes=True):
+        self.set_flag("enable_multiple", yes)
+
+    def set_var(self, var, *args):
+        a = map(lambda x: str(x).lower(), args)
+        self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
+
+    def set_down(self, down=True):
+        self.set_var("down", str(down).lower())
+
+    def set_joinable(self, joinable=True):
+        self.set_var("joinable", joinable)
+
+    def set_max_mds(self, max_mds):
+        self.set_var("max_mds", "%d" % max_mds)
+
+    def set_session_timeout(self, timeout):
+        self.set_var("session_timeout", "%d" % timeout)
+
+    def set_allow_standby_replay(self, yes):
+        self.set_var("allow_standby_replay", yes)
+
+    def set_allow_new_snaps(self, yes):
+        self.set_var("allow_new_snaps", yes, '--yes-i-really-mean-it')
+
+    def compat(self, *args):
+        a = map(lambda x: str(x).lower(), args)
+        self.mon_manager.raw_cluster_cmd("fs", "compat", self.name, *a)
+
+    def add_compat(self, *args):
+        self.compat("add_compat", *args)
+
+    def add_incompat(self, *args):
+        self.compat("add_incompat", *args)
+
+    def rm_compat(self, *args):
+        self.compat("rm_compat", *args)
+
+    def rm_incompat(self, *args):
+        self.compat("rm_incompat", *args)
+
+    def required_client_features(self, *args, **kwargs):
+        c = ["fs", "required_client_features", self.name, *args]
+        return self.mon_manager.run_cluster_cmd(args=c, **kwargs)
+
+    # Since v15.1.0 the pg autoscale mode has been enabled as default,
+    # will let the pg autoscale mode to calculate the pg_num as needed.
+    # We set the pg_num_min to 64 to make sure that pg autoscale mode
+    # won't set the pg_num to low to fix Tracker#45434.
+    pg_num = 64
+    pg_num_min = 64
+    target_size_ratio = 0.9
+    target_size_ratio_ec = 0.9
+
+    def create(self):
+        if self.name is None:
+            self.name = "cephfs"
+        if self.metadata_pool_name is None:
+            self.metadata_pool_name = "{0}_metadata".format(self.name)
+        if self.data_pool_name is None:
+            data_pool_name = "{0}_data".format(self.name)
+        else:
+            data_pool_name = self.data_pool_name
+
+        # will use the ec pool to store the data and a small amount of
+        # metadata still goes to the primary data pool for all files.
+        if not self.metadata_overlay and self.ec_profile and 'disabled' not in self.ec_profile:
+            self.target_size_ratio = 0.05
+
+        log.debug("Creating filesystem '{0}'".format(self.name))
+
+        try:
+            self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                             self.metadata_pool_name, str(self.pg_num),
+                                             '--pg_num_min', str(self.pg_num_min))
+
+            self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                             data_pool_name, str(self.pg_num),
+                                             '--pg_num_min', str(self.pg_num_min),
+                                             '--target_size_ratio',
+                                             str(self.target_size_ratio))
+        except CommandFailedError as e:
+            if e.exitstatus == 22: # nautilus couldn't specify --pg_num_min option
+                self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                                 self.metadata_pool_name,
+                                                 str(self.pg_num_min))
+
+                self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                                 data_pool_name, str(self.pg_num),
+                                                 str(self.pg_num_min))
+            else:
+                raise
+
+        if self.metadata_overlay:
+            self.mon_manager.raw_cluster_cmd('fs', 'new',
+                                             self.name, self.metadata_pool_name, data_pool_name,
+                                             '--allow-dangerous-metadata-overlay')
+        else:
+            self.mon_manager.raw_cluster_cmd('fs', 'new',
+                                             self.name,
+                                             self.metadata_pool_name,
+                                             data_pool_name)
+
+            if self.ec_profile and 'disabled' not in self.ec_profile:
+                ec_data_pool_name = data_pool_name + "_ec"
+                log.debug("EC profile is %s", self.ec_profile)
+                cmd = ['osd', 'erasure-code-profile', 'set', ec_data_pool_name]
+                cmd.extend(self.ec_profile)
+                self.mon_manager.raw_cluster_cmd(*cmd)
+                try:
+                    self.mon_manager.raw_cluster_cmd(
+                        'osd', 'pool', 'create', ec_data_pool_name,
+                        'erasure', ec_data_pool_name,
+                        '--pg_num_min', str(self.pg_num_min),
+                        '--target_size_ratio', str(self.target_size_ratio_ec))
+                except CommandFailedError as e:
+                    if e.exitstatus == 22: # nautilus couldn't specify --pg_num_min option
+                        self.mon_manager.raw_cluster_cmd(
+                            'osd', 'pool', 'create', ec_data_pool_name,
+                            str(self.pg_num_min), 'erasure', ec_data_pool_name)
+                    else:
+                        raise
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'set',
+                    ec_data_pool_name, 'allow_ec_overwrites', 'true')
+                self.add_data_pool(ec_data_pool_name, create=False)
+                self.check_pool_application(ec_data_pool_name)
+
+                self.run_client_payload(f"setfattr -n ceph.dir.layout.pool -v {ec_data_pool_name} . && getfattr -n ceph.dir.layout .")
+
+        self.check_pool_application(self.metadata_pool_name)
+        self.check_pool_application(data_pool_name)
+
+        # Turn off spurious standby count warnings from modifying max_mds in tests.
+        try:
+            self.mon_manager.raw_cluster_cmd('fs', 'set', self.name, 'standby_count_wanted', '0')
+        except CommandFailedError as e:
+            if e.exitstatus == 22:
+                # standby_count_wanted not available prior to luminous (upgrade tests would fail otherwise)
+                pass
+            else:
+                raise
+
+        if self.fs_config is not None:
+            max_mds = self.fs_config.get('max_mds', 1)
+            if max_mds > 1:
+                self.set_max_mds(max_mds)
+
+            standby_replay = self.fs_config.get('standby_replay', False)
+            self.set_allow_standby_replay(standby_replay)
+
+            # If absent will use the default value (60 seconds)
+            session_timeout = self.fs_config.get('session_timeout', 60)
+            if session_timeout != 60:
+                self.set_session_timeout(session_timeout)
+
+        self.getinfo(refresh = True)
+
+        # wait pgs to be clean
+        self.mon_manager.wait_for_clean()
+
+    def run_client_payload(self, cmd):
+        # avoid circular dep by importing here:
+        from tasks.cephfs.fuse_mount import FuseMount
+
+        # Wait for at MDS daemons to be ready before mounting the
+        # ceph-fuse client in run_client_payload()
+        self.wait_for_daemons()
+
+        d = misc.get_testdir(self._ctx)
+        m = FuseMount(self._ctx, {}, d, "admin", self.client_remote, cephfs_name=self.name)
+        m.mount_wait()
+        m.run_shell_payload(cmd)
+        m.umount_wait(require_clean=True)
+
+    def _remove_pool(self, name, **kwargs):
+        c = f'osd pool rm {name} {name} --yes-i-really-really-mean-it'
+        return self.mon_manager.ceph(c, **kwargs)
+
+    def rm(self, **kwargs):
+        c = f'fs rm {self.name} --yes-i-really-mean-it'
+        return self.mon_manager.ceph(c, **kwargs)
+
+    def remove_pools(self, data_pools):
+        self._remove_pool(self.get_metadata_pool_name())
+        for poolname in data_pools:
+            try:
+                self._remove_pool(poolname)
+            except CommandFailedError as e:
+                # EBUSY, this data pool is used by two metadata pools, let the
+                # 2nd pass delete it
+                if e.exitstatus == EBUSY:
+                    pass
+                else:
+                    raise
+
+    def destroy(self, reset_obj_attrs=True):
+        log.info(f'Destroying file system {self.name} and related pools')
+
+        if self.dead():
+            log.debug('already dead...')
+            return
+
+        data_pools = self.get_data_pool_names(refresh=True)
+
+        # make sure no MDSs are attached to given FS.
+        self.fail()
+        self.rm()
+
+        self.remove_pools(data_pools)
+
+        if reset_obj_attrs:
+            self.id = None
+            self.name = None
+            self.metadata_pool_name = None
+            self.data_pool_name = None
+            self.data_pools = None
+
+    def recreate(self):
+        self.destroy()
+
+        self.create()
+        self.getinfo(refresh=True)
+
+    def check_pool_application(self, pool_name):
+        osd_map = self.mon_manager.get_osd_dump_json()
+        for pool in osd_map['pools']:
+            if pool['pool_name'] == pool_name:
+                if "application_metadata" in pool:
+                    if not "cephfs" in pool['application_metadata']:
+                        raise RuntimeError("Pool {pool_name} does not name cephfs as application!".\
+                                           format(pool_name=pool_name))
+
+    def __del__(self):
+        if getattr(self._ctx, "filesystem", None) == self:
+            delattr(self._ctx, "filesystem")
+
+    def exists(self):
+        """
+        Whether a filesystem exists in the mon's filesystem list
+        """
+        fs_list = json.loads(self.mon_manager.raw_cluster_cmd('fs', 'ls', '--format=json-pretty'))
+        return self.name in [fs['name'] for fs in fs_list]
+
+    def legacy_configured(self):
+        """
+        Check if a legacy (i.e. pre "fs new") filesystem configuration is present.  If this is
+        the case, the caller should avoid using Filesystem.create
+        """
+        try:
+            out_text = self.mon_manager.raw_cluster_cmd('--format=json-pretty', 'osd', 'lspools')
+            pools = json.loads(out_text)
+            metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools]
+            if metadata_pool_exists:
+                self.metadata_pool_name = 'metadata'
+        except CommandFailedError as e:
+            # For use in upgrade tests, Ceph cuttlefish and earlier don't support
+            # structured output (--format) from the CLI.
+            if e.exitstatus == 22:
+                metadata_pool_exists = True
+            else:
+                raise
+
+        return metadata_pool_exists
+
+    def _df(self):
+        return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty"))
+
+    # may raise FSMissing
+    def get_mds_map(self, status=None):
+        if status is None:
+            status = self.status()
+        return status.get_fsmap(self.id)['mdsmap']
+
+    def get_var(self, var, status=None):
+        return self.get_mds_map(status=status)[var]
+
+    def set_dir_layout(self, mount, path, layout):
+        for name, value in layout.items():
+            mount.run_shell(args=["setfattr", "-n", "ceph.dir.layout."+name, "-v", str(value), path])
+
+    def add_data_pool(self, name, create=True):
+        if create:
+            try:
+                self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name,
+                                                 '--pg_num_min', str(self.pg_num_min))
+            except CommandFailedError as e:
+                if e.exitstatus == 22: # nautilus couldn't specify --pg_num_min option
+                  self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name,
+                                                   str(self.pg_num_min))
+                else:
+                    raise
+        self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name)
+        self.get_pool_names(refresh = True)
+        for poolid, fs_name in self.data_pools.items():
+            if name == fs_name:
+                return poolid
+        raise RuntimeError("could not get just created pool '{0}'".format(name))
+
+    def get_pool_names(self, refresh = False, status = None):
+        if refresh or self.metadata_pool_name is None or self.data_pools is None:
+            if status is None:
+                status = self.status()
+            fsmap = status.get_fsmap(self.id)
+
+            osd_map = self.mon_manager.get_osd_dump_json()
+            id_to_name = {}
+            for p in osd_map['pools']:
+                id_to_name[p['pool']] = p['pool_name']
+
+            self.metadata_pool_name = id_to_name[fsmap['mdsmap']['metadata_pool']]
+            self.data_pools = {}
+            for data_pool in fsmap['mdsmap']['data_pools']:
+                self.data_pools[data_pool] = id_to_name[data_pool]
+
+    def get_data_pool_name(self, refresh = False):
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        assert(len(self.data_pools) == 1)
+        return next(iter(self.data_pools.values()))
+
+    def get_data_pool_id(self, refresh = False):
+        """
+        Don't call this if you have multiple data pools
+        :return: integer
+        """
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        assert(len(self.data_pools) == 1)
+        return next(iter(self.data_pools.keys()))
+
+    def get_data_pool_names(self, refresh = False):
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        return list(self.data_pools.values())
+
+    def get_metadata_pool_name(self):
+        return self.metadata_pool_name
+
+    def set_data_pool_name(self, name):
+        if self.id is not None:
+            raise RuntimeError("can't set filesystem name if its fscid is set")
+        self.data_pool_name = name
+
+    def get_pool_pg_num(self, pool_name):
+        pgs = json.loads(self.mon_manager.raw_cluster_cmd('osd', 'pool', 'get',
+                                                          pool_name, 'pg_num',
+                                                          '--format=json-pretty'))
+        return int(pgs['pg_num'])
+
+    def get_namespace_id(self):
+        return self.id
+
+    def get_pool_df(self, pool_name):
+        """
+        Return a dict like:
+        {u'bytes_used': 0, u'max_avail': 83848701, u'objects': 0, u'kb_used': 0}
+        """
+        for pool_df in self._df()['pools']:
+            if pool_df['name'] == pool_name:
+                return pool_df['stats']
+
+        raise RuntimeError("Pool name '{0}' not found".format(pool_name))
+
+    def get_usage(self):
+        return self._df()['stats']['total_used_bytes']
+
+    def are_daemons_healthy(self, status=None, skip_max_mds_check=False):
+        """
+        Return true if all daemons are in one of active, standby, standby-replay, and
+        at least max_mds daemons are in 'active'.
+
+        Unlike most of Filesystem, this function is tolerant of new-style `fs`
+        commands being missing, because we are part of the ceph installation
+        process during upgrade suites, so must fall back to old style commands
+        when we get an EINVAL on a new style command.
+
+        :return:
+        """
+        # First, check to see that processes haven't exited with an error code
+        for mds in self._ctx.daemons.iter_daemons_of_role('mds'):
+            mds.check_status()
+
+        active_count = 0
+        mds_map = self.get_mds_map(status=status)
+
+        log.debug("are_daemons_healthy: mds map: {0}".format(mds_map))
+
+        for mds_id, mds_status in mds_map['info'].items():
+            if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]:
+                log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state']))
+                return False
+            elif mds_status['state'] == 'up:active':
+                active_count += 1
+
+        log.debug("are_daemons_healthy: {0}/{1}".format(
+            active_count, mds_map['max_mds']
+        ))
+
+        if not skip_max_mds_check:
+            if active_count > mds_map['max_mds']:
+                log.debug("are_daemons_healthy: number of actives is greater than max_mds: {0}".format(mds_map))
+                return False
+            elif active_count == mds_map['max_mds']:
+                # The MDSMap says these guys are active, but let's check they really are
+                for mds_id, mds_status in mds_map['info'].items():
+                    if mds_status['state'] == 'up:active':
+                        try:
+                            daemon_status = self.mds_tell(["status"], mds_id=mds_status['name'])
+                        except CommandFailedError as cfe:
+                            if cfe.exitstatus == errno.EINVAL:
+                                # Old version, can't do this check
+                                continue
+                            else:
+                                # MDS not even running
+                                return False
+
+                        if daemon_status['state'] != 'up:active':
+                            # MDS hasn't taken the latest map yet
+                            return False
+
+                return True
+            else:
+                return False
+        else:
+            log.debug("are_daemons_healthy: skipping max_mds check")
+            return True
+
+    def get_daemon_names(self, state=None, status=None):
+        """
+        Return MDS daemon names of those daemons in the given state
+        :param state:
+        :return:
+        """
+        mdsmap = self.get_mds_map(status)
+        result = []
+        for mds_status in sorted(mdsmap['info'].values(),
+                                 key=lambda _: _['rank']):
+            if mds_status['state'] == state or state is None:
+                result.append(mds_status['name'])
+
+        return result
+
+    def get_active_names(self, status=None):
+        """
+        Return MDS daemon names of those daemons holding ranks
+        in state up:active
+
+        :return: list of strings like ['a', 'b'], sorted by rank
+        """
+        return self.get_daemon_names("up:active", status=status)
+
+    def get_all_mds_rank(self, status=None):
+        mdsmap = self.get_mds_map(status)
+        result = []
+        for mds_status in sorted(mdsmap['info'].values(),
+                                 key=lambda _: _['rank']):
+            if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
+                result.append(mds_status['rank'])
+
+        return result
+
+    def get_rank(self, rank=None, status=None):
+        if status is None:
+            status = self.getinfo()
+        if rank is None:
+            rank = 0
+        return status.get_rank(self.id, rank)
+
+    def rank_restart(self, rank=0, status=None):
+        name = self.get_rank(rank=rank, status=status)['name']
+        self.mds_restart(mds_id=name)
+
+    def rank_signal(self, signal, rank=0, status=None):
+        name = self.get_rank(rank=rank, status=status)['name']
+        self.mds_signal(name, signal)
+
+    def rank_freeze(self, yes, rank=0):
+        self.mon_manager.raw_cluster_cmd("mds", "freeze", "{}:{}".format(self.id, rank), str(yes).lower())
+
+    def rank_fail(self, rank=0):
+        self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank))
+
+    def get_ranks(self, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_ranks(self.id)
+
+    def get_damaged(self, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_damaged(self.id)
+
+    def get_replays(self, status=None):
+        if status is None:
+            status = self.getinfo()
+        return status.get_replays(self.id)
+
+    def get_replay(self, rank=0, status=None):
+        for replay in self.get_replays(status=status):
+            if replay['rank'] == rank:
+                return replay
+        return None
+
+    def get_rank_names(self, status=None):
+        """
+        Return MDS daemon names of those daemons holding a rank,
+        sorted by rank.  This includes e.g. up:replay/reconnect
+        as well as active, but does not include standby or
+        standby-replay.
+        """
+        mdsmap = self.get_mds_map(status)
+        result = []
+        for mds_status in sorted(mdsmap['info'].values(),
+                                 key=lambda _: _['rank']):
+            if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
+                result.append(mds_status['name'])
+
+        return result
+
+    def wait_for_daemons(self, timeout=None, skip_max_mds_check=False, status=None):
+        """
+        Wait until all daemons are healthy
+        :return:
+        """
+
+        if timeout is None:
+            timeout = DAEMON_WAIT_TIMEOUT
+
+        if status is None:
+            status = self.status()
+
+        elapsed = 0
+        while True:
+            if self.are_daemons_healthy(status=status, skip_max_mds_check=skip_max_mds_check):
+                return status
+            else:
+                time.sleep(1)
+                elapsed += 1
+
+            if elapsed > timeout:
+                log.debug("status = {0}".format(status))
+                raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
+
+            status = self.status()
+
+    def dencoder(self, obj_type, obj_blob):
+        args = [os.path.join(self._prefix, "ceph-dencoder"), 'type', obj_type, 'import', '-', 'decode', 'dump_json']
+        p = self.mon_manager.controller.run(args=args, stdin=BytesIO(obj_blob), stdout=BytesIO())
+        return p.stdout.getvalue()
+
+    def rados(self, *args, **kwargs):
+        """
+        Callout to rados CLI.
+        """
+
+        return self.mon_manager.do_rados(*args, **kwargs)
+
+    def radosm(self, *args, **kwargs):
+        """
+        Interact with the metadata pool via rados CLI.
+        """
+
+        return self.rados(*args, **kwargs, pool=self.get_metadata_pool_name())
+
+    def radosmo(self, *args, stdout=BytesIO(), **kwargs):
+        """
+        Interact with the metadata pool via rados CLI. Get the stdout.
+        """
+
+        return self.radosm(*args, **kwargs, stdout=stdout).stdout.getvalue()
+
+    def get_metadata_object(self, object_type, object_id):
+        """
+        Retrieve an object from the metadata pool, pass it through
+        ceph-dencoder to dump it to JSON, and return the decoded object.
+        """
+
+        o = self.radosmo(['get', object_id, '-'])
+        j = self.dencoder(object_type, o)
+        try:
+            return json.loads(j)
+        except (TypeError, ValueError):
+            log.error("Failed to decode JSON: '{0}'".format(j))
+            raise
+
+    def get_journal_version(self):
+        """
+        Read the JournalPointer and Journal::Header objects to learn the version of
+        encoding in use.
+        """
+        journal_pointer_object = '400.00000000'
+        journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object)
+        journal_ino = journal_pointer_dump['journal_pointer']['front']
+
+        journal_header_object = "{0:x}.00000000".format(journal_ino)
+        journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object)
+
+        version = journal_header_dump['journal_header']['stream_format']
+        log.debug("Read journal version {0}".format(version))
+
+        return version
+
+    def mds_asok(self, command, mds_id=None, timeout=None):
+        if mds_id is None:
+            return self.rank_asok(command, timeout=timeout)
+
+        return self.json_asok(command, 'mds', mds_id, timeout=timeout)
+
+    def mds_tell(self, command, mds_id=None):
+        if mds_id is None:
+            return self.rank_tell(command)
+
+        return json.loads(self.mon_manager.raw_cluster_cmd("tell", f"mds.{mds_id}", *command))
+
+    def rank_asok(self, command, rank=0, status=None, timeout=None):
+        info = self.get_rank(rank=rank, status=status)
+        return self.json_asok(command, 'mds', info['name'], timeout=timeout)
+
+    def rank_tell(self, command, rank=0, status=None):
+        try:
+            out = self.mon_manager.raw_cluster_cmd("tell", f"mds.{self.id}:{rank}", *command)
+            return json.loads(out)
+        except json.decoder.JSONDecodeError:
+            log.error("could not decode: {}".format(out))
+            raise
+
+    def ranks_tell(self, command, status=None):
+        if status is None:
+            status = self.status()
+        out = []
+        for r in status.get_ranks(self.id):
+            result = self.rank_tell(command, rank=r['rank'], status=status)
+            out.append((r['rank'], result))
+        return sorted(out)
+
+    def ranks_perf(self, f, status=None):
+        perf = self.ranks_tell(["perf", "dump"], status=status)
+        out = []
+        for rank, perf in perf:
+            out.append((rank, f(perf)))
+        return out
+
+    def read_cache(self, path, depth=None):
+        cmd = ["dump", "tree", path]
+        if depth is not None:
+            cmd.append(depth.__str__())
+        result = self.mds_asok(cmd)
+        if len(result) == 0:
+            raise RuntimeError("Path not found in cache: {0}".format(path))
+
+        return result
+
+    def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None, rank=None):
+        """
+        Block until the MDS reaches a particular state, or a failure condition
+        is met.
+
+        When there are multiple MDSs, succeed when exaclty one MDS is in the
+        goal state, or fail when any MDS is in the reject state.
+
+        :param goal_state: Return once the MDS is in this state
+        :param reject: Fail if the MDS enters this state before the goal state
+        :param timeout: Fail if this many seconds pass before reaching goal
+        :return: number of seconds waited, rounded down to integer
+        """
+
+        started_at = time.time()
+        while True:
+            status = self.status()
+            if rank is not None:
+                try:
+                    mds_info = status.get_rank(self.id, rank)
+                    current_state = mds_info['state'] if mds_info else None
+                    log.debug("Looked up MDS state for mds.{0}: {1}".format(rank, current_state))
+                except:
+                    mdsmap = self.get_mds_map(status=status)
+                    if rank in mdsmap['failed']:
+                        log.debug("Waiting for rank {0} to come back.".format(rank))
+                        current_state = None
+                    else:
+                        raise
+            elif mds_id is not None:
+                # mds_info is None if no daemon with this ID exists in the map
+                mds_info = status.get_mds(mds_id)
+                current_state = mds_info['state'] if mds_info else None
+                log.debug("Looked up MDS state for {0}: {1}".format(mds_id, current_state))
+            else:
+                # In general, look for a single MDS
+                states = [m['state'] for m in status.get_ranks(self.id)]
+                if [s for s in states if s == goal_state] == [goal_state]:
+                    current_state = goal_state
+                elif reject in states:
+                    current_state = reject
+                else:
+                    current_state = None
+                log.debug("mapped states {0} to {1}".format(states, current_state))
+
+            elapsed = time.time() - started_at
+            if current_state == goal_state:
+                log.debug("reached state '{0}' in {1}s".format(current_state, elapsed))
+                return elapsed
+            elif reject is not None and current_state == reject:
+                raise RuntimeError("MDS in reject state {0}".format(current_state))
+            elif timeout is not None and elapsed > timeout:
+                log.error("MDS status at timeout: {0}".format(status.get_fsmap(self.id)))
+                raise RuntimeError(
+                    "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format(
+                        elapsed, goal_state, current_state
+                    ))
+            else:
+                time.sleep(1)
+
+    def _read_data_xattr(self, ino_no, xattr_name, obj_type, pool):
+        if pool is None:
+            pool = self.get_data_pool_name()
+
+        obj_name = "{0:x}.00000000".format(ino_no)
+
+        args = ["getxattr", obj_name, xattr_name]
+        try:
+            proc = self.rados(args, pool=pool, stdout=BytesIO())
+        except CommandFailedError as e:
+            log.error(e.__str__())
+            raise ObjectNotFound(obj_name)
+
+        obj_blob = proc.stdout.getvalue()
+        return json.loads(self.dencoder(obj_type, obj_blob).strip())
+
+    def _write_data_xattr(self, ino_no, xattr_name, data, pool=None):
+        """
+        Write to an xattr of the 0th data object of an inode.  Will
+        succeed whether the object and/or xattr already exist or not.
+
+        :param ino_no: integer inode number
+        :param xattr_name: string name of the xattr
+        :param data: byte array data to write to the xattr
+        :param pool: name of data pool or None to use primary data pool
+        :return: None
+        """
+        if pool is None:
+            pool = self.get_data_pool_name()
+
+        obj_name = "{0:x}.00000000".format(ino_no)
+        args = ["setxattr", obj_name, xattr_name, data]
+        self.rados(args, pool=pool)
+
+    def read_backtrace(self, ino_no, pool=None):
+        """
+        Read the backtrace from the data pool, return a dict in the format
+        given by inode_backtrace_t::dump, which is something like:
+
+        ::
+
+            rados -p cephfs_data getxattr 10000000002.00000000 parent > out.bin
+            ceph-dencoder type inode_backtrace_t import out.bin decode dump_json
+
+            { "ino": 1099511627778,
+              "ancestors": [
+                    { "dirino": 1,
+                      "dname": "blah",
+                      "version": 11}],
+              "pool": 1,
+              "old_pools": []}
+
+        :param pool: name of pool to read backtrace from.  If omitted, FS must have only
+                     one data pool and that will be used.
+        """
+        return self._read_data_xattr(ino_no, "parent", "inode_backtrace_t", pool)
+
+    def read_layout(self, ino_no, pool=None):
+        """
+        Read 'layout' xattr of an inode and parse the result, returning a dict like:
+        ::
+            {
+                "stripe_unit": 4194304,
+                "stripe_count": 1,
+                "object_size": 4194304,
+                "pool_id": 1,
+                "pool_ns": "",
+            }
+
+        :param pool: name of pool to read backtrace from.  If omitted, FS must have only
+                     one data pool and that will be used.
+        """
+        return self._read_data_xattr(ino_no, "layout", "file_layout_t", pool)
+
+    def _enumerate_data_objects(self, ino, size):
+        """
+        Get the list of expected data objects for a range, and the list of objects
+        that really exist.
+
+        :return a tuple of two lists of strings (expected, actual)
+        """
+        stripe_size = 1024 * 1024 * 4
+
+        size = max(stripe_size, size)
+
+        want_objects = [
+            "{0:x}.{1:08x}".format(ino, n)
+            for n in range(0, ((size - 1) // stripe_size) + 1)
+        ]
+
+        exist_objects = self.rados(["ls"], pool=self.get_data_pool_name(), stdout=StringIO()).stdout.getvalue().split("\n")
+
+        return want_objects, exist_objects
+
+    def data_objects_present(self, ino, size):
+        """
+        Check that *all* the expected data objects for an inode are present in the data pool
+        """
+
+        want_objects, exist_objects = self._enumerate_data_objects(ino, size)
+        missing = set(want_objects) - set(exist_objects)
+
+        if missing:
+            log.debug("Objects missing (ino {0}, size {1}): {2}".format(
+                ino, size, missing
+            ))
+            return False
+        else:
+            log.debug("All objects for ino {0} size {1} found".format(ino, size))
+            return True
+
+    def data_objects_absent(self, ino, size):
+        want_objects, exist_objects = self._enumerate_data_objects(ino, size)
+        present = set(want_objects) & set(exist_objects)
+
+        if present:
+            log.debug("Objects not absent (ino {0}, size {1}): {2}".format(
+                ino, size, present
+            ))
+            return False
+        else:
+            log.debug("All objects for ino {0} size {1} are absent".format(ino, size))
+            return True
+
+    def dirfrag_exists(self, ino, frag):
+        try:
+            self.radosm(["stat", "{0:x}.{1:08x}".format(ino, frag)])
+        except CommandFailedError:
+            return False
+        else:
+            return True
+
+    def list_dirfrag(self, dir_ino):
+        """
+        Read the named object and return the list of omap keys
+
+        :return a list of 0 or more strings
+        """
+
+        dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
+
+        try:
+            key_list_str = self.radosmo(["listomapkeys", dirfrag_obj_name], stdout=StringIO())
+        except CommandFailedError as e:
+            log.error(e.__str__())
+            raise ObjectNotFound(dirfrag_obj_name)
+
+        return key_list_str.strip().split("\n") if key_list_str else []
+
+    def get_meta_of_fs_file(self, dir_ino, obj_name, out):
+        """
+        get metadata from parent to verify the correctness of the data format encoded by the tool, cephfs-meta-injection.
+        warning : The splitting of directory is not considered here.
+        """
+
+        dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
+        try:
+            self.radosm(["getomapval", dirfrag_obj_name, obj_name+"_head", out])
+        except CommandFailedError as e:
+            log.error(e.__str__())
+            raise ObjectNotFound(dir_ino)
+
+    def erase_metadata_objects(self, prefix):
+        """
+        For all objects in the metadata pool matching the prefix,
+        erase them.
+
+        This O(N) with the number of objects in the pool, so only suitable
+        for use on toy test filesystems.
+        """
+        all_objects = self.radosmo(["ls"], stdout=StringIO()).strip().split("\n")
+        matching_objects = [o for o in all_objects if o.startswith(prefix)]
+        for o in matching_objects:
+            self.radosm(["rm", o])
+
+    def erase_mds_objects(self, rank):
+        """
+        Erase all the per-MDS objects for a particular rank.  This includes
+        inotable, sessiontable, journal
+        """
+
+        def obj_prefix(multiplier):
+            """
+            MDS object naming conventions like rank 1's
+            journal is at 201.***
+            """
+            return "%x." % (multiplier * 0x100 + rank)
+
+        # MDS_INO_LOG_OFFSET
+        self.erase_metadata_objects(obj_prefix(2))
+        # MDS_INO_LOG_BACKUP_OFFSET
+        self.erase_metadata_objects(obj_prefix(3))
+        # MDS_INO_LOG_POINTER_OFFSET
+        self.erase_metadata_objects(obj_prefix(4))
+        # MDSTables & SessionMap
+        self.erase_metadata_objects("mds{rank:d}_".format(rank=rank))
+
+    @property
+    def _prefix(self):
+        """
+        Override this to set a different
+        """
+        return ""
+
+    def _make_rank(self, rank):
+        return "{}:{}".format(self.name, rank)
+
+    def _run_tool(self, tool, args, rank=None, quiet=False):
+        # Tests frequently have [client] configuration that jacks up
+        # the objecter log level (unlikely to be interesting here)
+        # and does not set the mds log level (very interesting here)
+        if quiet:
+            base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1']
+        else:
+            base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
+
+        if rank is not None:
+            base_args.extend(["--rank", "%s" % str(rank)])
+
+        t1 = datetime.datetime.now()
+        r = self.tool_remote.sh(script=base_args + args, stdout=StringIO()).strip()
+        duration = datetime.datetime.now() - t1
+        log.debug("Ran {0} in time {1}, result:\n{2}".format(
+            base_args + args, duration, r
+        ))
+        return r
+
+    @property
+    def tool_remote(self):
+        """
+        An arbitrary remote to use when invoking recovery tools.  Use an MDS host because
+        it'll definitely have keys with perms to access cephfs metadata pool.  This is public
+        so that tests can use this remote to go get locally written output files from the tools.
+        """
+        return self.mon_manager.controller
+
+    def journal_tool(self, args, rank, quiet=False):
+        """
+        Invoke cephfs-journal-tool with the passed arguments for a rank, and return its stdout
+        """
+        fs_rank = self._make_rank(rank)
+        return self._run_tool("cephfs-journal-tool", args, fs_rank, quiet)
+
+    def meta_tool(self, args, rank, quiet=False):
+        """
+        Invoke cephfs-meta-injection with the passed arguments for a rank, and return its stdout
+        """
+        fs_rank = self._make_rank(rank)
+        return self._run_tool("cephfs-meta-injection", args, fs_rank, quiet)
+
+    def table_tool(self, args, quiet=False):
+        """
+        Invoke cephfs-table-tool with the passed arguments, and return its stdout
+        """
+        return self._run_tool("cephfs-table-tool", args, None, quiet)
+
+    def data_scan(self, args, quiet=False, worker_count=1):
+        """
+        Invoke cephfs-data-scan with the passed arguments, and return its stdout
+
+        :param worker_count: if greater than 1, multiple workers will be run
+                             in parallel and the return value will be None
+        """
+
+        workers = []
+
+        for n in range(0, worker_count):
+            if worker_count > 1:
+                # data-scan args first token is a command, followed by args to it.
+                # insert worker arguments after the command.
+                cmd = args[0]
+                worker_args = [cmd] + ["--worker_n", n.__str__(), "--worker_m", worker_count.__str__()] + args[1:]
+            else:
+                worker_args = args
+
+            workers.append(Greenlet.spawn(lambda wargs=worker_args:
+                                          self._run_tool("cephfs-data-scan", wargs, None, quiet)))
+
+        for w in workers:
+            w.get()
+
+        if worker_count == 1:
+            return workers[0].value
+        else:
+            return None
+
+    def is_full(self):
+        return self.is_pool_full(self.get_data_pool_name())
+
+    def authorize(self, client_id, caps=('/', 'rw')):
+        """
+        Run "ceph fs authorize" and run "ceph auth get" to get and returnt the
+        keyring.
+
+        client_id: client id that will be authorized
+        caps: tuple containing the path and permission (can be r or rw)
+              respectively.
+        """
+        client_name = 'client.' + client_id
+        return self.mon_manager.raw_cluster_cmd('fs', 'authorize', self.name,
+                                                client_name, *caps)
+
+    def grow(self, new_max_mds, status=None):
+        oldmax = self.get_var('max_mds', status=status)
+        assert(new_max_mds > oldmax)
+        self.set_max_mds(new_max_mds)
+        return self.wait_for_daemons()
+
+    def shrink(self, new_max_mds, status=None):
+        oldmax = self.get_var('max_mds', status=status)
+        assert(new_max_mds < oldmax)
+        self.set_max_mds(new_max_mds)
+        return self.wait_for_daemons()
+
+    def run_scrub(self, cmd, rank=0):
+        return self.rank_tell(["scrub"] + cmd, rank)
+
+    def get_scrub_status(self, rank=0):
+        return self.run_scrub(["status"], rank)
+
+    def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30,
+                                  timeout=300, reverse=False):
+        # time out after "timeout" seconds and assume as done
+        if result is None:
+            result = "no active scrubs running"
+        with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
+            while proceed():
+                out_json = self.rank_tell(["scrub", "status"], rank=rank)
+                assert out_json is not None
+                if not reverse:
+                    if result in out_json['status']:
+                        log.info("all active scrubs completed")
+                        return True
+                else:
+                    if result not in out_json['status']:
+                        log.info("all active scrubs completed")
+                        return True
+
+                if tag is not None:
+                    status = out_json['scrubs'][tag]
+                    if status is not None:
+                        log.info(f"scrub status for tag:{tag} - {status}")
+                    else:
+                        log.info(f"scrub has completed for tag:{tag}")
+                        return True
+
+        # timed out waiting for scrub to complete
+        return False
diff --git a/qa/tasks/cephfs/fuse_mount.py b/qa/tasks/cephfs/fuse_mount.py
new file mode 100644
index 000000000..5c5d1c85c
--- /dev/null
+++ b/qa/tasks/cephfs/fuse_mount.py
@@ -0,0 +1,516 @@
+import json
+import time
+import logging
+
+from io import StringIO
+from textwrap import dedent
+
+from teuthology.contextutil import MaxWhileTries
+from teuthology.contextutil import safe_while
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+from tasks.ceph_manager import get_valgrind_args
+from tasks.cephfs.mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+# Refer mount.py for docstrings.
+class FuseMount(CephFSMount):
+    def __init__(self, ctx, client_config, test_dir, client_id,
+                 client_remote, client_keyring_path=None, cephfs_name=None,
+                 cephfs_mntpt=None, hostfs_mntpt=None, brxnet=None):
+        super(FuseMount, self).__init__(ctx=ctx, test_dir=test_dir,
+            client_id=client_id, client_remote=client_remote,
+            client_keyring_path=client_keyring_path, hostfs_mntpt=hostfs_mntpt,
+            cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet)
+
+        self.client_config = client_config if client_config else {}
+        self.fuse_daemon = None
+        self._fuse_conn = None
+        self.id = None
+        self.inst = None
+        self.addr = None
+
+    def mount(self, mntopts=[], createfs=True, check_status=True, **kwargs):
+        self.update_attrs(**kwargs)
+        self.assert_and_log_minimum_mount_details()
+
+        self.setup_netns()
+
+        if createfs:
+            # TODO: don't call setupfs() from within mount(), since it's
+            # absurd. The proper order should be: create FS first and then
+            # call mount().
+            self.setupfs(name=self.cephfs_name)
+
+        try:
+            return self._mount(mntopts, check_status)
+        except RuntimeError:
+            # Catch exceptions by the mount() logic (i.e. not remote command
+            # failures) and ensure the mount is not left half-up.
+            # Otherwise we might leave a zombie mount point that causes
+            # anyone traversing cephtest/ to get hung up on.
+            log.warning("Trying to clean up after failed mount")
+            self.umount_wait(force=True)
+            raise
+
+    def _mount(self, mntopts, check_status):
+        log.info("Client client.%s config is %s" % (self.client_id,
+                                                    self.client_config))
+
+        daemon_signal = 'kill'
+        if self.client_config.get('coverage') or \
+           self.client_config.get('valgrind') is not None:
+            daemon_signal = 'term'
+
+        # Use 0000 mode to prevent undesired modifications to the mountpoint on
+        # the local file system.
+        script = f'mkdir -m 0000 -p -v {self.hostfs_mntpt}'.split()
+        stderr = StringIO()
+        try:
+            self.client_remote.run(args=script, timeout=(15*60),
+                stderr=StringIO())
+        except CommandFailedError:
+            if 'file exists' not in stderr.getvalue().lower():
+                raise
+
+        run_cmd = [
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+            'daemon-helper',
+            daemon_signal,
+        ]
+
+        fuse_cmd = [
+            'ceph-fuse', "-f",
+            "--admin-socket", "/var/run/ceph/$cluster-$name.$pid.asok",
+        ]
+        if self.client_id is not None:
+            fuse_cmd += ['--id', self.client_id]
+        if self.client_keyring_path and self.client_id is not None:
+            fuse_cmd += ['-k', self.client_keyring_path]
+        if self.cephfs_mntpt is not None:
+            fuse_cmd += ["--client_mountpoint=" + self.cephfs_mntpt]
+        if self.cephfs_name is not None:
+            fuse_cmd += ["--client_fs=" + self.cephfs_name]
+        if mntopts:
+            fuse_cmd += mntopts
+        fuse_cmd.append(self.hostfs_mntpt)
+
+        if self.client_config.get('valgrind') is not None:
+            run_cmd = get_valgrind_args(
+                self.test_dir,
+                'client.{id}'.format(id=self.client_id),
+                run_cmd,
+                self.client_config.get('valgrind'),
+                cd=False
+            )
+
+        netns_prefix = ['sudo', 'nsenter',
+                        '--net=/var/run/netns/{0}'.format(self.netns_name)]
+        run_cmd = netns_prefix + run_cmd
+
+        run_cmd.extend(fuse_cmd)
+
+        def list_connections():
+            conn_dir = "/sys/fs/fuse/connections"
+
+            self.client_remote.run(args=['sudo', 'modprobe', 'fuse'],
+                                   check_status=False)
+            self.client_remote.run(
+                args=["sudo", "mount", "-t", "fusectl", conn_dir, conn_dir],
+                check_status=False, timeout=(30))
+
+            try:
+                ls_str = self.client_remote.sh("ls " + conn_dir,
+                                               stdout=StringIO(),
+                                               timeout=(15*60)).strip()
+            except CommandFailedError:
+                return []
+
+            if ls_str:
+                return [int(n) for n in ls_str.split("\n")]
+            else:
+                return []
+
+        # Before starting ceph-fuse process, note the contents of
+        # /sys/fs/fuse/connections
+        pre_mount_conns = list_connections()
+        log.info("Pre-mount connections: {0}".format(pre_mount_conns))
+
+        mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO()
+        self.fuse_daemon = self.client_remote.run(
+            args=run_cmd,
+            logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)),
+            stdin=run.PIPE,
+            stdout=mountcmd_stdout,
+            stderr=mountcmd_stderr,
+            wait=False
+        )
+
+        # Wait for the connection reference to appear in /sys
+        mount_wait = self.client_config.get('mount_wait', 0)
+        if mount_wait > 0:
+            log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait))
+            time.sleep(mount_wait)            
+        timeout = int(self.client_config.get('mount_timeout', 30))
+        waited = 0
+
+        post_mount_conns = list_connections()
+        while len(post_mount_conns) <= len(pre_mount_conns):
+            if self.fuse_daemon.finished:
+                # Did mount fail?  Raise the CommandFailedError instead of
+                # hitting the "failed to populate /sys/" timeout
+                try:
+                    self.fuse_daemon.wait()
+                except CommandFailedError as e:
+                    log.info('mount command failed.')
+                    if check_status:
+                        raise
+                    else:
+                        return (e, mountcmd_stdout.getvalue(),
+                                mountcmd_stderr.getvalue())
+            time.sleep(1)
+            waited += 1
+            if waited > timeout:
+                raise RuntimeError(
+                    "Fuse mount failed to populate/sys/ after {} "
+                    "seconds".format(waited))
+            else:
+                post_mount_conns = list_connections()
+
+        log.info("Post-mount connections: {0}".format(post_mount_conns))
+
+        # Record our fuse connection number so that we can use it when
+        # forcing an unmount
+        new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
+        if len(new_conns) == 0:
+            raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
+        elif len(new_conns) > 1:
+            raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
+        else:
+            self._fuse_conn = new_conns[0]
+
+        self.gather_mount_info()
+
+        self.mounted = True
+
+    def gather_mount_info(self):
+        status = self.admin_socket(['status'])
+        self.id = status['id']
+        self.client_pid = status['metadata']['pid']
+        try:
+            self.inst = status['inst_str']
+            self.addr = status['addr_str']
+        except KeyError:
+            sessions = self.fs.rank_asok(['session', 'ls'])
+            for s in sessions:
+                if s['id'] == self.id:
+                    self.inst = s['inst']
+                    self.addr = self.inst.split()[1]
+            if self.inst is None:
+                raise RuntimeError("cannot find client session")
+
+    def check_mounted_state(self):
+        proc = self.client_remote.run(
+            args=[
+                'stat',
+                '--file-system',
+                '--printf=%T\n',
+                '--',
+                self.hostfs_mntpt,
+            ],
+            stdout=StringIO(),
+            stderr=StringIO(),
+            wait=False,
+            timeout=(15*60)
+        )
+        try:
+            proc.wait()
+        except CommandFailedError:
+            error = proc.stderr.getvalue()
+            if ("endpoint is not connected" in error
+            or "Software caused connection abort" in error):
+                # This happens is fuse is killed without unmount
+                log.warning("Found stale mount point at {0}".format(self.hostfs_mntpt))
+                return True
+            else:
+                # This happens if the mount directory doesn't exist
+                log.info('mount point does not exist: %s', self.hostfs_mntpt)
+                return False
+
+        fstype = proc.stdout.getvalue().rstrip('\n')
+        if fstype == 'fuseblk':
+            log.info('ceph-fuse is mounted on %s', self.hostfs_mntpt)
+            return True
+        else:
+            log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format(
+                fstype=fstype))
+            return False
+
+    def wait_until_mounted(self):
+        """
+        Check to make sure that fuse is mounted on mountpoint.  If not,
+        sleep for 5 seconds and check again.
+        """
+
+        while not self.check_mounted_state():
+            # Even if it's not mounted, it should at least
+            # be running: catch simple failures where it has terminated.
+            assert not self.fuse_daemon.poll()
+
+            time.sleep(5)
+
+        self.mounted = True
+
+        # Now that we're mounted, set permissions so that the rest of the test
+        # will have unrestricted access to the filesystem mount.
+        for retry in range(10):
+            try:
+                stderr = StringIO()
+                self.client_remote.run(args=['sudo', 'chmod', '1777',
+                                             self.hostfs_mntpt],
+                                       timeout=(15*60),
+                                       stderr=stderr, omit_sudo=False)
+                break
+            except run.CommandFailedError:
+                stderr = stderr.getvalue().lower()
+                if "read-only file system" in stderr:
+                    break
+                elif "permission denied" in stderr:
+                    time.sleep(5)
+                else:
+                    raise
+
+    def _mountpoint_exists(self):
+        return self.client_remote.run(args=["ls", "-d", self.hostfs_mntpt], check_status=False, timeout=(15*60)).exitstatus == 0
+
+    def umount(self, cleanup=True):
+        """
+        umount() must not run cleanup() when it's called by umount_wait()
+        since "run.wait([self.fuse_daemon], timeout)" would hang otherwise.
+        """
+        if not self.is_mounted():
+            if cleanup:
+                self.cleanup()
+            return
+
+        try:
+            log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name))
+            stderr = StringIO()
+            self.client_remote.run(args=['sudo', 'fusermount', '-u',
+                                         self.hostfs_mntpt],
+                                   stderr=stderr,
+                                   timeout=(30*60), omit_sudo=False)
+        except run.CommandFailedError:
+            if "mountpoint not found" in stderr.getvalue():
+                # This happens if the mount directory doesn't exist
+                log.info('mount point does not exist: %s', self.mountpoint)
+            elif "not mounted" in stderr.getvalue():
+                # This happens if the mount directory already unmouted
+                log.info('mount point not mounted: %s', self.mountpoint)
+            else:
+                log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name))
+
+                self.client_remote.run(
+                    args=['sudo', run.Raw('PATH=/usr/sbin:$PATH'), 'lsof',
+                    run.Raw(';'), 'ps', 'auxf'],
+                    timeout=(60*15), omit_sudo=False)
+
+                # abort the fuse mount, killing all hung processes
+                if self._fuse_conn:
+                    self.run_python(dedent("""
+                    import os
+                    path = "/sys/fs/fuse/connections/{0}/abort"
+                    if os.path.exists(path):
+                        open(path, "w").write("1")
+                    """).format(self._fuse_conn))
+                    self._fuse_conn = None
+
+                stderr = StringIO()
+                # make sure its unmounted
+                try:
+                    self.client_remote.run(args=['sudo', 'umount', '-l', '-f',
+                                                 self.hostfs_mntpt],
+                                           stderr=stderr, timeout=(60*15), omit_sudo=False)
+                except CommandFailedError:
+                    if self.is_mounted():
+                        raise
+
+        self.mounted = False
+        self._fuse_conn = None
+        self.id = None
+        self.inst = None
+        self.addr = None
+        if cleanup:
+            self.cleanup()
+
+    def umount_wait(self, force=False, require_clean=False, timeout=900):
+        """
+        :param force: Complete cleanly even if the MDS is offline
+        """
+        if not (self.is_mounted() and self.fuse_daemon):
+            log.debug('ceph-fuse client.{id} is not mounted at {remote} '
+                      '{mnt}'.format(id=self.client_id,
+                                     remote=self.client_remote,
+                                     mnt=self.hostfs_mntpt))
+            self.cleanup()
+            return
+
+        if force:
+            assert not require_clean  # mutually exclusive
+
+            # When we expect to be forcing, kill the ceph-fuse process directly.
+            # This should avoid hitting the more aggressive fallback killing
+            # in umount() which can affect other mounts too.
+            self.fuse_daemon.stdin.close()
+
+            # However, we will still hit the aggressive wait if there is an ongoing
+            # mount -o remount (especially if the remount is stuck because MDSs
+            # are unavailable)
+
+        # cleanup is set to to fail since clieanup must happen after umount is
+        # complete; otherwise following call to run.wait hangs.
+        self.umount(cleanup=False)
+
+        try:
+            # Permit a timeout, so that we do not block forever
+            run.wait([self.fuse_daemon], timeout)
+
+        except MaxWhileTries:
+            log.error("process failed to terminate after unmount. This probably"
+                      " indicates a bug within ceph-fuse.")
+            raise
+        except CommandFailedError:
+            if require_clean:
+                raise
+
+        self.mounted = False
+        self.cleanup()
+
+    def teardown(self):
+        """
+        Whatever the state of the mount, get it gone.
+        """
+        super(FuseMount, self).teardown()
+
+        self.umount()
+
+        if self.fuse_daemon and not self.fuse_daemon.finished:
+            self.fuse_daemon.stdin.close()
+            try:
+                self.fuse_daemon.wait()
+            except CommandFailedError:
+                pass
+
+        self.mounted = False
+
+    def _asok_path(self):
+        return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id)
+
+    @property
+    def _prefix(self):
+        return ""
+
+    def find_admin_socket(self):
+        pyscript = """
+import glob
+import re
+import os
+import subprocess
+
+def _find_admin_socket(client_name):
+        asok_path = "{asok_path}"
+        files = glob.glob(asok_path)
+        mountpoint = "{mountpoint}"
+
+        # Given a non-glob path, it better be there
+        if "*" not in asok_path:
+            assert(len(files) == 1)
+            return files[0]
+
+        for f in files:
+                pid = re.match(".*\.(\d+)\.asok$", f).group(1)
+                if os.path.exists("/proc/{{0}}".format(pid)):
+                    with open("/proc/{{0}}/cmdline".format(pid), 'r') as proc_f:
+                        contents = proc_f.read()
+                        if mountpoint in contents:
+                            return f
+        raise RuntimeError("Client socket {{0}} not found".format(client_name))
+
+print(_find_admin_socket("{client_name}"))
+""".format(
+            asok_path=self._asok_path(),
+            client_name="client.{0}".format(self.client_id),
+            mountpoint=self.mountpoint)
+
+        asok_path = self.run_python(pyscript, sudo=True)
+        log.info("Found client admin socket at {0}".format(asok_path))
+        return asok_path
+
+    def admin_socket(self, args):
+        asok_path = self.find_admin_socket()
+
+        # Query client ID from admin socket, wait 2 seconds
+        # and retry 10 times if it is not ready
+        with safe_while(sleep=2, tries=10) as proceed:
+            while proceed():
+                try:
+                    p = self.client_remote.run(args=
+                        ['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args,
+                        stdout=StringIO(), stderr=StringIO(), wait=False,
+                        timeout=(15*60))
+                    p.wait()
+                    break
+                except CommandFailedError:
+                    if "connection refused" in p.stderr.getvalue().lower():
+                        pass
+
+        return json.loads(p.stdout.getvalue().strip())
+
+    def get_global_id(self):
+        """
+        Look up the CephFS client ID for this mount
+        """
+        return self.admin_socket(['mds_sessions'])['id']
+
+    def get_global_inst(self):
+        """
+        Look up the CephFS client instance for this mount
+        """
+        return self.inst
+
+    def get_global_addr(self):
+        """
+        Look up the CephFS client addr for this mount
+        """
+        return self.addr
+
+    def get_client_pid(self):
+        """
+        return pid of ceph-fuse process
+        """
+        status = self.admin_socket(['status'])
+        return status['metadata']['pid']
+
+    def get_osd_epoch(self):
+        """
+        Return 2-tuple of osd_epoch, osd_epoch_barrier
+        """
+        status = self.admin_socket(['status'])
+        return status['osd_epoch'], status['osd_epoch_barrier']
+
+    def get_dentry_count(self):
+        """
+        Return 2-tuple of dentry_count, dentry_pinned_count
+        """
+        status = self.admin_socket(['status'])
+        return status['dentry_count'], status['dentry_pinned_count']
+
+    def set_cache_size(self, size):
+        return self.admin_socket(['config', 'set', 'client_cache_size', str(size)])
+
+    def get_op_read_count(self):
+        return self.admin_socket(['perf', 'dump', 'objecter'])['objecter']['osdop_read']
diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py
new file mode 100644
index 000000000..f4640e3fd
--- /dev/null
+++ b/qa/tasks/cephfs/kernel_mount.py
@@ -0,0 +1,324 @@
+import errno
+import json
+import logging
+import os
+import re
+
+from io import StringIO
+from textwrap import dedent
+
+from teuthology.orchestra.run import CommandFailedError
+from teuthology.orchestra import run
+from teuthology.contextutil import MaxWhileTries
+
+from tasks.cephfs.mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+
+UMOUNT_TIMEOUT = 300
+
+
+class KernelMount(CephFSMount):
+    def __init__(self, ctx, test_dir, client_id, client_remote,
+                 client_keyring_path=None, hostfs_mntpt=None,
+                 cephfs_name=None, cephfs_mntpt=None, brxnet=None, config={}):
+        super(KernelMount, self).__init__(ctx=ctx, test_dir=test_dir,
+            client_id=client_id, client_remote=client_remote,
+            client_keyring_path=client_keyring_path, hostfs_mntpt=hostfs_mntpt,
+            cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet)
+
+        self.rbytes = config.get('rbytes', False)
+        self.inst = None
+        self.addr = None
+
+    def mount(self, mntopts=[], createfs=True, check_status=True, **kwargs):
+        self.update_attrs(**kwargs)
+        self.assert_and_log_minimum_mount_details()
+
+        self.setup_netns()
+
+        # TODO: don't call setupfs() from within mount(), since it's
+        # absurd. The proper order should be: create FS first and then
+        # call mount().
+        if createfs:
+            self.setupfs(name=self.cephfs_name)
+        if not self.cephfs_mntpt:
+            self.cephfs_mntpt = '/'
+
+        stderr = StringIO()
+        try:
+            self.client_remote.run(args=['mkdir', '-p', self.hostfs_mntpt],
+                                   timeout=(5*60), stderr=stderr)
+        except CommandFailedError:
+            if 'file exists' not in stderr.getvalue().lower():
+                raise
+
+        retval = self._run_mount_cmd(mntopts, check_status)
+        if retval:
+            return retval
+
+        stderr = StringIO()
+        try:
+            self.client_remote.run(
+                args=['sudo', 'chmod', '1777', self.hostfs_mntpt],
+                stderr=stderr, timeout=(5*60))
+        except CommandFailedError:
+            # the client does not have write permissions in the caps it holds
+            # for the Ceph FS that was just mounted.
+            if 'permission denied' in stderr.getvalue().lower():
+                pass
+
+
+        self.mounted = True
+
+    def _run_mount_cmd(self, mntopts, check_status):
+        opts = 'norequire_active_mds'
+        if self.client_id:
+            opts += ',name=' + self.client_id
+        if self.client_keyring_path and self.client_id:
+            opts += ',secret=' + self.get_key_from_keyfile()
+        if self.config_path:
+            opts += ',conf=' + self.config_path
+        if self.cephfs_name:
+            opts += ",mds_namespace=" + self.cephfs_name
+        if self.rbytes:
+            opts += ",rbytes"
+        else:
+            opts += ",norbytes"
+        if mntopts:
+            opts += ',' + ','.join(mntopts)
+
+        mount_dev = ':' + self.cephfs_mntpt
+        prefix = ['sudo', 'adjust-ulimits', 'ceph-coverage',
+                  self.test_dir + '/archive/coverage',
+                  'nsenter',
+                  '--net=/var/run/netns/{0}'.format(self.netns_name)]
+        cmdargs = prefix + ['/bin/mount', '-t', 'ceph', mount_dev,
+                            self.hostfs_mntpt, '-v', '-o', opts]
+
+        mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO()
+        try:
+            self.client_remote.run(args=cmdargs, timeout=(30*60),
+                                   stdout=mountcmd_stdout,
+                                   stderr=mountcmd_stderr)
+        except CommandFailedError as e:
+            log.info('mount command failed')
+            if check_status:
+                raise
+            else:
+                return (e, mountcmd_stdout.getvalue(),
+                        mountcmd_stderr.getvalue())
+        log.info('mount command passed')
+
+    def umount(self, force=False):
+        if not self.is_mounted():
+            self.cleanup()
+            return
+
+        log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
+
+        try:
+            cmd=['sudo', 'umount', self.hostfs_mntpt]
+            if force:
+                cmd.append('-f')
+            self.client_remote.run(args=cmd, timeout=(15*60), omit_sudo=False)
+        except Exception as e:
+            self.client_remote.run(
+                args=['sudo', run.Raw('PATH=/usr/sbin:$PATH'), 'lsof',
+                      run.Raw(';'), 'ps', 'auxf'],
+                timeout=(15*60), omit_sudo=False)
+            raise e
+
+        self.mounted = False
+        self.cleanup()
+
+    def umount_wait(self, force=False, require_clean=False, timeout=900):
+        """
+        Unlike the fuse client, the kernel client's umount is immediate
+        """
+        if not self.is_mounted():
+            self.cleanup()
+            return
+
+        try:
+            self.umount(force)
+        except (CommandFailedError, MaxWhileTries):
+            if not force:
+                raise
+
+            # force delete the netns and umount
+            self.client_remote.run(args=['sudo', 'umount', '-f', '-l',
+                                         self.mountpoint],
+                                   timeout=(15*60), omit_sudo=False)
+
+            self.mounted = False
+            self.cleanup()
+
+    def wait_until_mounted(self):
+        """
+        Unlike the fuse client, the kernel client is up and running as soon
+        as the initial mount() function returns.
+        """
+        assert self.mounted
+
+    def teardown(self):
+        super(KernelMount, self).teardown()
+        if self.mounted:
+            self.umount()
+
+    def _get_debug_dir(self):
+        """
+        Get the debugfs folder for this mount
+        """
+
+        cluster_name = 'ceph'
+        fsid = self.ctx.ceph[cluster_name].fsid
+
+        global_id = self._get_global_id()
+
+        return os.path.join("/sys/kernel/debug/ceph/", f"{fsid}.client{global_id}")
+
+    def read_debug_file(self, filename):
+        """
+        Read the debug file "filename", return None if the file doesn't exist.
+        """
+
+        path = os.path.join(self._get_debug_dir(), filename)
+
+        stdout = StringIO()
+        stderr = StringIO()
+        try:
+            self.run_shell_payload(f"sudo dd if={path}", timeout=(5 * 60),
+                                   stdout=stdout, stderr=stderr)
+            return stdout.getvalue()
+        except CommandFailedError:
+            if 'no such file or directory' in stderr.getvalue().lower():
+                return errno.ENOENT
+            elif 'not a directory' in stderr.getvalue().lower():
+                return errno.ENOTDIR
+            elif 'permission denied' in stderr.getvalue().lower():
+                return errno.EACCES
+            raise
+
+    def _get_global_id(self):
+        try:
+            p = self.run_shell_payload("getfattr --only-values -n ceph.client_id .", stdout=StringIO())
+            v = p.stdout.getvalue()
+            prefix = "client"
+            assert v.startswith(prefix)
+            return int(v[len(prefix):])
+        except CommandFailedError:
+            # Probably this fallback can be deleted in a few releases when the kernel xattr is widely available.
+            log.debug("Falling back to messy global_id lookup via /sys...")
+
+            pyscript = dedent("""
+                import glob
+                import os
+                import json
+
+                def get_id_to_dir():
+                    result = {}
+                    for dir in glob.glob("/sys/kernel/debug/ceph/*"):
+                        mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines()
+                        global_id = mds_sessions_lines[0].split()[1].strip('"')
+                        client_id = mds_sessions_lines[1].split()[1].strip('"')
+                        result[client_id] = global_id
+                    return result
+                print(json.dumps(get_id_to_dir()))
+            """)
+
+            output = self.client_remote.sh([
+                'sudo', 'python3', '-c', pyscript
+            ], timeout=(5*60))
+            client_id_to_global_id = json.loads(output)
+
+            try:
+                return client_id_to_global_id[self.client_id]
+            except KeyError:
+                log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format(
+                    self.client_id, ",".join(client_id_to_global_id.keys())
+                ))
+                raise
+
+    def get_global_id(self):
+        """
+        Look up the CephFS client ID for this mount, using debugfs.
+        """
+
+        assert self.mounted
+
+        return self._get_global_id()
+
+    @property
+    def _global_addr(self):
+        if self.addr is not None:
+            return self.addr
+
+        # The first line of the "status" file's output will be something
+        # like:
+        #   "instance: client.4297 (0)10.72.47.117:0/1148470933"
+        # What we need here is only the string "10.72.47.117:0/1148470933"
+        status = self.read_debug_file("status")
+        if status is None:
+            return None
+
+        instance = re.findall(r'instance:.*', status)[0]
+        self.addr = instance.split()[2].split(')')[1]
+        return self.addr;
+
+    @property
+    def _global_inst(self):
+        if self.inst is not None:
+            return self.inst
+
+        client_gid = "client%d" % self.get_global_id()
+        self.inst = " ".join([client_gid, self._global_addr])
+        return self.inst
+
+    def get_global_inst(self):
+        """
+        Look up the CephFS client instance for this mount
+        """
+        return self._global_inst
+
+    def get_global_addr(self):
+        """
+        Look up the CephFS client addr for this mount
+        """
+        return self._global_addr
+
+    def get_osd_epoch(self):
+        """
+        Return 2-tuple of osd_epoch, osd_epoch_barrier
+        """
+        osd_map = self.read_debug_file("osdmap")
+        assert osd_map
+
+        lines = osd_map.split("\n")
+        first_line_tokens = lines[0].split()
+        epoch, barrier = int(first_line_tokens[1]), int(first_line_tokens[3])
+
+        return epoch, barrier
+
+    def get_op_read_count(self):
+        stdout = StringIO()
+        stderr = StringIO()
+        try:
+            path = os.path.join(self._get_debug_dir(), "metrics/size")
+            self.run_shell(f"sudo stat {path}", stdout=stdout,
+                           stderr=stderr, cwd=None)
+            buf = self.read_debug_file("metrics/size")
+        except CommandFailedError:
+            if 'no such file or directory' in stderr.getvalue().lower() \
+                    or 'not a directory' in stderr.getvalue().lower():
+                try:
+                    path = os.path.join(self._get_debug_dir(), "metrics")
+                    self.run_shell(f"sudo stat {path}", stdout=stdout,
+                                   stderr=stderr, cwd=None)
+                    buf = self.read_debug_file("metrics")
+                except CommandFailedError:
+                    return errno.ENOENT
+            else:
+                return 0
+        return int(re.findall(r'read.*', buf)[0].split()[1])
diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py
new file mode 100644
index 000000000..a24fd284a
--- /dev/null
+++ b/qa/tasks/cephfs/mount.py
@@ -0,0 +1,1329 @@
+import hashlib
+import json
+import logging
+import datetime
+import os
+import re
+import time
+
+from io import StringIO
+from contextlib import contextmanager
+from textwrap import dedent
+from IPy import IP
+
+from teuthology.contextutil import safe_while
+from teuthology.misc import get_file, write_file
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError, Raw
+
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+class CephFSMount(object):
+    def __init__(self, ctx, test_dir, client_id, client_remote,
+                 client_keyring_path=None, hostfs_mntpt=None,
+                 cephfs_name=None, cephfs_mntpt=None, brxnet=None):
+        """
+        :param test_dir: Global teuthology test dir
+        :param client_id: Client ID, the 'foo' in client.foo
+        :param client_keyring_path: path to keyring for given client_id
+        :param client_remote: Remote instance for the host where client will
+                              run
+        :param hostfs_mntpt: Path to directory on the FS on which Ceph FS will
+                             be mounted
+        :param cephfs_name: Name of Ceph FS to be mounted
+        :param cephfs_mntpt: Path to directory inside Ceph FS that will be
+                             mounted as root
+        """
+        self.mounted = False
+        self.ctx = ctx
+        self.test_dir = test_dir
+
+        self._verify_attrs(client_id=client_id,
+                           client_keyring_path=client_keyring_path,
+                           hostfs_mntpt=hostfs_mntpt, cephfs_name=cephfs_name,
+                           cephfs_mntpt=cephfs_mntpt)
+
+        self.client_id = client_id
+        self.client_keyring_path = client_keyring_path
+        self.client_remote = client_remote
+        if hostfs_mntpt:
+            self.hostfs_mntpt = hostfs_mntpt
+            self.hostfs_mntpt_dirname = os.path.basename(self.hostfs_mntpt)
+        else:
+            self.hostfs_mntpt = os.path.join(self.test_dir, f'mnt.{self.client_id}')
+        self.cephfs_name = cephfs_name
+        self.cephfs_mntpt = cephfs_mntpt
+
+        self.fs = None
+
+        self._netns_name = None
+        self.nsid = -1
+        if brxnet is None:
+            self.ceph_brx_net = '192.168.0.0/16'
+        else:
+            self.ceph_brx_net = brxnet
+
+        self.test_files = ['a', 'b', 'c']
+
+        self.background_procs = []
+
+    # This will cleanup the stale netnses, which are from the
+    # last failed test cases.
+    @staticmethod
+    def cleanup_stale_netnses_and_bridge(remote):
+        p = remote.run(args=['ip', 'netns', 'list'],
+                       stdout=StringIO(), timeout=(5*60))
+        p = p.stdout.getvalue().strip()
+
+        # Get the netns name list
+        netns_list = re.findall(r'ceph-ns-[^()\s][-.\w]+[^():\s]', p)
+
+        # Remove the stale netnses
+        for ns in netns_list:
+            ns_name = ns.split()[0]
+            args = ['sudo', 'ip', 'netns', 'delete', '{0}'.format(ns_name)]
+            try:
+                remote.run(args=args, timeout=(5*60), omit_sudo=False)
+            except Exception:
+                pass
+
+        # Remove the stale 'ceph-brx'
+        try:
+            args = ['sudo', 'ip', 'link', 'delete', 'ceph-brx']
+            remote.run(args=args, timeout=(5*60), omit_sudo=False)
+        except Exception:
+            pass
+
+    def _parse_netns_name(self):
+        self._netns_name = '-'.join(["ceph-ns",
+                                     re.sub(r'/+', "-", self.mountpoint)])
+
+    @property
+    def mountpoint(self):
+        if self.hostfs_mntpt == None:
+            self.hostfs_mntpt = os.path.join(self.test_dir,
+                                             self.hostfs_mntpt_dirname)
+        return self.hostfs_mntpt
+
+    @mountpoint.setter
+    def mountpoint(self, path):
+        if not isinstance(path, str):
+            raise RuntimeError('path should be of str type.')
+        self._mountpoint = self.hostfs_mntpt = path
+
+    @property
+    def netns_name(self):
+        if self._netns_name == None:
+            self._parse_netns_name()
+        return self._netns_name
+
+    @netns_name.setter
+    def netns_name(self, name):
+        self._netns_name = name
+
+    def assert_and_log_minimum_mount_details(self):
+        """
+        Make sure we have minimum details required for mounting. Ideally, this
+        method should be called at the beginning of the mount method.
+        """
+        if not self.client_id or not self.client_remote or \
+           not self.hostfs_mntpt:
+            errmsg = ('Mounting CephFS requires that at least following '
+                      'details to be provided -\n'
+                      '1. the client ID,\n2. the mountpoint and\n'
+                      '3. the remote machine where CephFS will be mounted.\n')
+            raise RuntimeError(errmsg)
+
+        log.info('Mounting Ceph FS. Following are details of mount; remember '
+                 '"None" represents Python type None -')
+        log.info(f'self.client_remote.hostname = {self.client_remote.hostname}')
+        log.info(f'self.client.name = client.{self.client_id}')
+        log.info(f'self.hostfs_mntpt = {self.hostfs_mntpt}')
+        log.info(f'self.cephfs_name = {self.cephfs_name}')
+        log.info(f'self.cephfs_mntpt = {self.cephfs_mntpt}')
+        log.info(f'self.client_keyring_path = {self.client_keyring_path}')
+        if self.client_keyring_path:
+            log.info('keyring content -\n' +
+                     get_file(self.client_remote, self.client_keyring_path,
+                              sudo=True).decode())
+
+    def is_mounted(self):
+        return self.mounted
+
+    def setupfs(self, name=None):
+        if name is None and self.fs is not None:
+            # Previous mount existed, reuse the old name
+            name = self.fs.name
+        self.fs = Filesystem(self.ctx, name=name)
+        log.info('Wait for MDS to reach steady state...')
+        self.fs.wait_for_daemons()
+        log.info('Ready to start {}...'.format(type(self).__name__))
+
+    def _setup_brx_and_nat(self):
+        # The ip for ceph-brx should be
+        ip = IP(self.ceph_brx_net)[-2]
+        mask = self.ceph_brx_net.split('/')[1]
+        brd = IP(self.ceph_brx_net).broadcast()
+
+        brx = self.client_remote.run(args=['ip', 'addr'], stderr=StringIO(),
+                                     stdout=StringIO(), timeout=(5*60))
+        brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue())
+        if brx:
+            # If the 'ceph-brx' already exists, then check whether
+            # the new net is conflicting with it
+            _ip, _mask = brx[0].split()[1].split('/', 1)
+            if _ip != "{}".format(ip) or _mask != mask:
+                raise RuntimeError("Conflict with existing ceph-brx {0}, new {1}/{2}".format(brx[0].split()[1], ip, mask))
+
+        # Setup the ceph-brx and always use the last valid IP
+        if not brx:
+            log.info("Setuping the 'ceph-brx' with {0}/{1}".format(ip, mask))
+
+            self.run_shell_payload(f"""
+                set -e
+                sudo ip link add name ceph-brx type bridge
+                sudo ip addr flush dev ceph-brx
+                sudo ip link set ceph-brx up
+                sudo ip addr add {ip}/{mask} brd {brd} dev ceph-brx
+            """, timeout=(5*60), omit_sudo=False, cwd='/')
+        
+        args = "echo 1 | sudo tee /proc/sys/net/ipv4/ip_forward"
+        self.client_remote.run(args=args, timeout=(5*60), omit_sudo=False)
+        
+        # Setup the NAT
+        p = self.client_remote.run(args=['route'], stderr=StringIO(),
+                                   stdout=StringIO(), timeout=(5*60))
+        p = re.findall(r'default .*', p.stdout.getvalue())
+        if p == False:
+            raise RuntimeError("No default gw found")
+        gw = p[0].split()[7]
+
+        self.run_shell_payload(f"""
+            set -e
+            sudo iptables -A FORWARD -o {gw} -i ceph-brx -j ACCEPT
+            sudo iptables -A FORWARD -i {gw} -o ceph-brx -j ACCEPT
+            sudo iptables -t nat -A POSTROUTING -s {ip}/{mask} -o {gw} -j MASQUERADE
+        """, timeout=(5*60), omit_sudo=False, cwd='/')
+
+    def _setup_netns(self):
+        p = self.client_remote.run(args=['ip', 'netns', 'list'],
+                                   stderr=StringIO(), stdout=StringIO(),
+                                   timeout=(5*60)).stdout.getvalue().strip()
+
+        # Get the netns name list
+        netns_list = re.findall(r'[^()\s][-.\w]+[^():\s]', p)
+
+        out = re.search(r"{0}".format(self.netns_name), p)
+        if out is None:
+            # Get an uniq nsid for the new netns
+            nsid = 0
+            p = self.client_remote.run(args=['ip', 'netns', 'list-id'],
+                                       stderr=StringIO(), stdout=StringIO(),
+                                       timeout=(5*60)).stdout.getvalue()
+            while True:
+                out = re.search(r"nsid {} ".format(nsid), p)
+                if out is None:
+                    break
+
+                nsid += 1
+
+            # Add one new netns and set it id
+            self.run_shell_payload(f"""
+                set -e
+                sudo ip netns add {self.netns_name}
+                sudo ip netns set {self.netns_name} {nsid}
+            """, timeout=(5*60), omit_sudo=False, cwd='/')
+            self.nsid = nsid;
+        else:
+            # The netns already exists and maybe suspended by self.kill()
+            self.resume_netns();
+
+            nsid = int(re.search(r"{0} \(id: (\d+)\)".format(self.netns_name), p).group(1))
+            self.nsid = nsid;
+            return
+
+        # Get one ip address for netns
+        ips = IP(self.ceph_brx_net)
+        for ip in ips:
+            found = False
+            if ip == ips[0]:
+                continue
+            if ip == ips[-2]:
+                raise RuntimeError("we have ran out of the ip addresses")
+
+            for ns in netns_list:
+                ns_name = ns.split()[0]
+                args = ['sudo', 'ip', 'netns', 'exec', '{0}'.format(ns_name), 'ip', 'addr']
+                try:
+                    p = self.client_remote.run(args=args, stderr=StringIO(),
+                                               stdout=StringIO(), timeout=(5*60),
+                                               omit_sudo=False)
+                    q = re.search("{0}".format(ip), p.stdout.getvalue())
+                    if q is not None:
+                        found = True
+                        break
+                except CommandFailedError:
+                    if "No such file or directory" in p.stderr.getvalue():
+                        pass
+                    if "Invalid argument" in p.stderr.getvalue():
+                        pass
+
+            if found == False:
+                break
+
+        mask = self.ceph_brx_net.split('/')[1]
+        brd = IP(self.ceph_brx_net).broadcast()
+
+        log.info("Setuping the netns '{0}' with {1}/{2}".format(self.netns_name, ip, mask))
+
+        # Setup the veth interfaces
+        brxip = IP(self.ceph_brx_net)[-2]
+        self.run_shell_payload(f"""
+            set -e
+            sudo ip link add veth0 netns {self.netns_name} type veth peer name brx.{nsid}
+            sudo ip netns exec {self.netns_name} ip addr add {ip}/{mask} brd {brd} dev veth0
+            sudo ip netns exec {self.netns_name} ip link set veth0 up
+            sudo ip netns exec {self.netns_name} ip link set lo up
+            sudo ip netns exec {self.netns_name} ip route add default via {brxip}
+        """, timeout=(5*60), omit_sudo=False, cwd='/')
+
+        # Bring up the brx interface and join it to 'ceph-brx'
+        self.run_shell_payload(f"""
+            set -e
+            sudo ip link set brx.{nsid} up
+            sudo ip link set dev brx.{nsid} master ceph-brx
+        """, timeout=(5*60), omit_sudo=False, cwd='/')
+
+    def _cleanup_netns(self):
+        if self.nsid == -1:
+            return
+        log.info("Removing the netns '{0}'".format(self.netns_name))
+
+        # Delete the netns and the peer veth interface
+        self.run_shell_payload(f"""
+            set -e
+            sudo ip link set brx.{self.nsid} down
+            sudo ip link delete dev brx.{self.nsid}
+            sudo ip netns delete {self.netns_name}
+        """, timeout=(5*60), omit_sudo=False, cwd='/')
+
+        self.nsid = -1
+
+    def _cleanup_brx_and_nat(self):
+        brx = self.client_remote.run(args=['ip', 'addr'], stderr=StringIO(),
+                                     stdout=StringIO(), timeout=(5*60))
+        brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue())
+        if not brx:
+            return
+
+        # If we are the last netns, will delete the ceph-brx
+        args = ['sudo', 'ip', 'link', 'show']
+        p = self.client_remote.run(args=args, stdout=StringIO(),
+                                   timeout=(5*60), omit_sudo=False)
+        _list = re.findall(r'brx\.', p.stdout.getvalue().strip())
+        if len(_list) != 0:
+            return
+
+        log.info("Removing the 'ceph-brx'")
+
+        self.run_shell_payload("""
+            set -e
+            sudo ip link set ceph-brx down
+            sudo ip link delete ceph-brx
+        """, timeout=(5*60), omit_sudo=False, cwd='/')
+
+        # Drop the iptables NAT rules
+        ip = IP(self.ceph_brx_net)[-2]
+        mask = self.ceph_brx_net.split('/')[1]
+
+        p = self.client_remote.run(args=['route'], stderr=StringIO(),
+                                   stdout=StringIO(), timeout=(5*60))
+        p = re.findall(r'default .*', p.stdout.getvalue())
+        if p == False:
+            raise RuntimeError("No default gw found")
+        gw = p[0].split()[7]
+        self.run_shell_payload(f"""
+            set -e
+            sudo iptables -D FORWARD -o {gw} -i ceph-brx -j ACCEPT
+            sudo iptables -D FORWARD -i {gw} -o ceph-brx -j ACCEPT
+            sudo iptables -t nat -D POSTROUTING -s {ip}/{mask} -o {gw} -j MASQUERADE
+        """, timeout=(5*60), omit_sudo=False, cwd='/')
+
+    def setup_netns(self):
+        """
+        Setup the netns for the mountpoint.
+        """
+        log.info("Setting the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint))
+        self._setup_brx_and_nat()
+        self._setup_netns()
+
+    def cleanup_netns(self):
+        """
+        Cleanup the netns for the mountpoint.
+        """
+        # We will defer cleaning the netnses and bridge until the last
+        # mountpoint is unmounted, this will be a temporary work around
+        # for issue#46282.
+
+        # log.info("Cleaning the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint))
+        # self._cleanup_netns()
+        # self._cleanup_brx_and_nat()
+
+    def suspend_netns(self):
+        """
+        Suspend the netns veth interface.
+        """
+        if self.nsid == -1:
+            return
+
+        log.info("Suspending the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint))
+
+        args = ['sudo', 'ip', 'link', 'set', 'brx.{0}'.format(self.nsid), 'down']
+        self.client_remote.run(args=args, timeout=(5*60), omit_sudo=False)
+
+    def resume_netns(self):
+        """
+        Resume the netns veth interface.
+        """
+        if self.nsid == -1:
+            return
+
+        log.info("Resuming the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint))
+
+        args = ['sudo', 'ip', 'link', 'set', 'brx.{0}'.format(self.nsid), 'up']
+        self.client_remote.run(args=args, timeout=(5*60), omit_sudo=False)
+
+    def mount(self, mntopts=[], createfs=True, check_status=True, **kwargs):
+        """
+        kwargs expects its members to be same as the arguments accepted by
+        self.update_attrs().
+        """
+        raise NotImplementedError()
+
+    def mount_wait(self, **kwargs):
+        """
+        Accepts arguments same as self.mount().
+        """
+        self.mount(**kwargs)
+        self.wait_until_mounted()
+
+    def umount(self):
+        raise NotImplementedError()
+
+    def umount_wait(self, force=False, require_clean=False, timeout=None):
+        """
+
+        :param force: Expect that the mount will not shutdown cleanly: kill
+                      it hard.
+        :param require_clean: Wait for the Ceph client associated with the
+                              mount (e.g. ceph-fuse) to terminate, and
+                              raise if it doesn't do so cleanly.
+        :param timeout: amount of time to be waited for umount command to finish
+        :return:
+        """
+        raise NotImplementedError()
+
+    def _verify_attrs(self, **kwargs):
+        """
+        Verify that client_id, client_keyring_path, client_remote, hostfs_mntpt,
+        cephfs_name, cephfs_mntpt are either type str or None.
+        """
+        for k, v in kwargs.items():
+            if v is not None and not isinstance(v, str):
+                raise RuntimeError('value of attributes should be either str '
+                                   f'or None. {k} - {v}')
+
+    def update_attrs(self, client_id=None, client_keyring_path=None,
+                     client_remote=None, hostfs_mntpt=None, cephfs_name=None,
+                     cephfs_mntpt=None):
+        if not (client_id or client_keyring_path or client_remote or
+                cephfs_name or cephfs_mntpt or hostfs_mntpt):
+            return
+
+        self._verify_attrs(client_id=client_id,
+                           client_keyring_path=client_keyring_path,
+                           hostfs_mntpt=hostfs_mntpt, cephfs_name=cephfs_name,
+                           cephfs_mntpt=cephfs_mntpt)
+
+        if client_id:
+            self.client_id = client_id
+        if client_keyring_path:
+            self.client_keyring_path = client_keyring_path
+        if client_remote:
+            self.client_remote = client_remote
+        if hostfs_mntpt:
+            self.hostfs_mntpt = hostfs_mntpt
+        if cephfs_name:
+            self.cephfs_name = cephfs_name
+        if cephfs_mntpt:
+            self.cephfs_mntpt = cephfs_mntpt
+
+    def remount(self, **kwargs):
+        """
+        Update mount object's attributes and attempt remount with these
+        new values for these attrbiutes.
+
+        1. Run umount_wait().
+        2. Run update_attrs().
+        3. Run mount().
+
+        Accepts arguments of self.mount() and self.update_attrs() with 2 exceptions -
+        1. Accepts wait too which can be True or False.
+        2. The default value of createfs is False.
+        """
+        self.umount_wait()
+        assert not self.mounted
+
+        mntopts = kwargs.pop('mntopts', [])
+        createfs = kwargs.pop('createfs', False)
+        check_status = kwargs.pop('check_status', True)
+        wait = kwargs.pop('wait', True)
+
+        self.update_attrs(**kwargs)
+
+        retval = self.mount(mntopts=mntopts, createfs=createfs,
+                            check_status=check_status)
+        # avoid this scenario (again): mount command might've failed and
+        # check_status might have silenced the exception, yet we attempt to
+        # wait which might lead to an error.
+        if retval is None and wait:
+            self.wait_until_mounted()
+
+        return retval
+
+    def kill(self):
+        """
+        Suspend the netns veth interface to make the client disconnected
+        from the ceph cluster
+        """
+        log.info('Killing connection on {0}...'.format(self.client_remote.name))
+        self.suspend_netns()
+
+    def kill_cleanup(self):
+        """
+        Follow up ``kill`` to get to a clean unmounted state.
+        """
+        log.info('Cleaning up killed connection on {0}'.format(self.client_remote.name))
+        self.umount_wait(force=True)
+
+    def cleanup(self):
+        """
+        Remove the mount point.
+
+        Prerequisite: the client is not mounted.
+        """
+        log.info('Cleaning up mount {0}'.format(self.client_remote.name))
+        stderr = StringIO()
+        try:
+            self.client_remote.run(args=['rmdir', '--', self.mountpoint],
+                                   cwd=self.test_dir, stderr=stderr,
+                                   timeout=(60*5), check_status=False)
+        except CommandFailedError:
+            if "no such file or directory" not in stderr.getvalue().lower():
+                raise
+
+        self.cleanup_netns()
+
+    def wait_until_mounted(self):
+        raise NotImplementedError()
+
+    def get_keyring_path(self):
+        # N.B.: default keyring is /etc/ceph/ceph.keyring; see ceph.py and generate_caps
+        return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)
+
+    def get_key_from_keyfile(self):
+        # XXX: don't call run_shell(), since CephFS might be unmounted.
+        keyring = self.client_remote.run(
+            args=['sudo', 'cat', self.client_keyring_path], stdout=StringIO(),
+            omit_sudo=False).stdout.getvalue()
+        for line in keyring.split('\n'):
+            if line.find('key') != -1:
+                return line[line.find('=') + 1 : ].strip()
+
+    @property
+    def config_path(self):
+        """
+        Path to ceph.conf: override this if you're not a normal systemwide ceph install
+        :return: stringv
+        """
+        return "/etc/ceph/ceph.conf"
+
+    @contextmanager
+    def mounted_wait(self):
+        """
+        A context manager, from an initially unmounted state, to mount
+        this, yield, and then unmount and clean up.
+        """
+        self.mount()
+        self.wait_until_mounted()
+        try:
+            yield
+        finally:
+            self.umount_wait()
+
+    def create_file(self, filename='testfile', dirname=None, user=None,
+                    check_status=True):
+        assert(self.is_mounted())
+
+        if not os.path.isabs(filename):
+            if dirname:
+                if os.path.isabs(dirname):
+                    path = os.path.join(dirname, filename)
+                else:
+                    path = os.path.join(self.hostfs_mntpt, dirname, filename)
+            else:
+                path = os.path.join(self.hostfs_mntpt, filename)
+        else:
+            path = filename
+
+        if user:
+            args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', 'touch ' + path]
+        else:
+            args = 'touch ' + path
+
+        return self.client_remote.run(args=args, check_status=check_status)
+
+    def create_files(self):
+        assert(self.is_mounted())
+
+        for suffix in self.test_files:
+            log.info("Creating file {0}".format(suffix))
+            self.client_remote.run(args=[
+                'touch', os.path.join(self.hostfs_mntpt, suffix)
+            ])
+
+    def test_create_file(self, filename='testfile', dirname=None, user=None,
+                         check_status=True):
+        return self.create_file(filename=filename, dirname=dirname, user=user,
+                                check_status=False)
+
+    def check_files(self):
+        assert(self.is_mounted())
+
+        for suffix in self.test_files:
+            log.info("Checking file {0}".format(suffix))
+            r = self.client_remote.run(args=[
+                'ls', os.path.join(self.hostfs_mntpt, suffix)
+            ], check_status=False)
+            if r.exitstatus != 0:
+                raise RuntimeError("Expected file {0} not found".format(suffix))
+
+    def write_file(self, path, data, perms=None):
+        """
+        Write the given data at the given path and set the given perms to the
+        file on the path.
+        """
+        if path.find(self.hostfs_mntpt) == -1:
+            path = os.path.join(self.hostfs_mntpt, path)
+
+        write_file(self.client_remote, path, data)
+
+        if perms:
+            self.run_shell(args=f'chmod {perms} {path}')
+
+    def read_file(self, path):
+        """
+        Return the data from the file on given path.
+        """
+        if path.find(self.hostfs_mntpt) == -1:
+            path = os.path.join(self.hostfs_mntpt, path)
+
+        return self.run_shell(args=['cat', path]).\
+            stdout.getvalue().strip()
+
+    def create_destroy(self):
+        assert(self.is_mounted())
+
+        filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
+        log.debug("Creating test file {0}".format(filename))
+        self.client_remote.run(args=[
+            'touch', os.path.join(self.hostfs_mntpt, filename)
+        ])
+        log.debug("Deleting test file {0}".format(filename))
+        self.client_remote.run(args=[
+            'rm', '-f', os.path.join(self.hostfs_mntpt, filename)
+        ])
+
+    def _run_python(self, pyscript, py_version='python3', sudo=False):
+        args = []
+        if sudo:
+            args.append('sudo')
+        args += ['adjust-ulimits', 'daemon-helper', 'kill', py_version, '-c', pyscript]
+        return self.client_remote.run(args=args, wait=False, stdin=run.PIPE, stdout=StringIO())
+
+    def run_python(self, pyscript, py_version='python3', sudo=False):
+        p = self._run_python(pyscript, py_version, sudo=sudo)
+        p.wait()
+        return p.stdout.getvalue().strip()
+
+    def run_shell(self, args, timeout=900, **kwargs):
+        args = args.split() if isinstance(args, str) else args
+        omit_sudo = kwargs.pop('omit_sudo', False)
+        sudo = kwargs.pop('sudo', False)
+        cwd = kwargs.pop('cwd', self.mountpoint)
+        stdout = kwargs.pop('stdout', StringIO())
+        stderr = kwargs.pop('stderr', StringIO())
+
+        if sudo:
+            args.insert(0, 'sudo')
+
+        return self.client_remote.run(args=args, cwd=cwd, timeout=timeout,
+                                      stdout=stdout, stderr=stderr,
+                                      omit_sudo=omit_sudo, **kwargs)
+
+    def run_shell_payload(self, payload, **kwargs):
+        return self.run_shell(["bash", "-c", Raw(f"'{payload}'")], **kwargs)
+
+    def run_as_user(self, **kwargs):
+        """
+        Besides the arguments defined for run_shell() this method also
+        accepts argument 'user'.
+        """
+        args = kwargs.pop('args')
+        user = kwargs.pop('user')
+        if isinstance(args, str):
+            args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', args]
+        elif isinstance(args, list):
+            cmdlist = args
+            cmd = ''
+            for i in cmdlist:
+                cmd = cmd + i + ' '
+            # get rid of extra space at the end.
+            cmd = cmd[:-1]
+
+            args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', cmd]
+
+        kwargs['args'] = args
+        return self.run_shell(**kwargs)
+
+    def run_as_root(self, **kwargs):
+        """
+        Accepts same arguments as run_shell().
+        """
+        kwargs['user'] = 'root'
+        return self.run_as_user(**kwargs)
+
+    def _verify(self, proc, retval=None, errmsg=None):
+        if retval:
+            msg = ('expected return value: {}\nreceived return value: '
+                   '{}\n'.format(retval, proc.returncode))
+            assert proc.returncode == retval, msg
+
+        if errmsg:
+            stderr = proc.stderr.getvalue().lower()
+            msg = ('didn\'t find given string in stderr -\nexpected string: '
+                   '{}\nreceived error message: {}\nnote: received error '
+                   'message is converted to lowercase'.format(errmsg, stderr))
+            assert errmsg in stderr, msg
+
+    def negtestcmd(self, args, retval=None, errmsg=None, stdin=None,
+                   cwd=None, wait=True):
+        """
+        Conduct a negative test for the given command.
+
+        retval and errmsg are parameters to confirm the cause of command
+        failure.
+        """
+        proc = self.run_shell(args=args, wait=wait, stdin=stdin, cwd=cwd,
+                              check_status=False)
+        self._verify(proc, retval, errmsg)
+        return proc
+
+    def negtestcmd_as_user(self, args, user, retval=None, errmsg=None,
+                           stdin=None, cwd=None, wait=True):
+        proc = self.run_as_user(args=args, user=user, wait=wait, stdin=stdin,
+                                cwd=cwd, check_status=False)
+        self._verify(proc, retval, errmsg)
+        return proc
+
+    def negtestcmd_as_root(self, args, retval=None, errmsg=None, stdin=None,
+                           cwd=None, wait=True):
+        proc = self.run_as_root(args=args, wait=wait, stdin=stdin, cwd=cwd,
+                                check_status=False)
+        self._verify(proc, retval, errmsg)
+        return proc
+
+    def open_no_data(self, basename):
+        """
+        A pure metadata operation
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.hostfs_mntpt, basename)
+
+        p = self._run_python(dedent(
+            """
+            f = open("{path}", 'w')
+            """.format(path=path)
+        ))
+        p.wait()
+
+    def open_background(self, basename="background_file", write=True):
+        """
+        Open a file for writing, then block such that the client
+        will hold a capability.
+
+        Don't return until the remote process has got as far as opening
+        the file, then return the RemoteProcess instance.
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.hostfs_mntpt, basename)
+
+        if write:
+            pyscript = dedent("""
+                import time
+
+                with open("{path}", 'w') as f:
+                    f.write('content')
+                    f.flush()
+                    f.write('content2')
+                    while True:
+                        time.sleep(1)
+                """).format(path=path)
+        else:
+            pyscript = dedent("""
+                import time
+
+                with open("{path}", 'r') as f:
+                    while True:
+                        time.sleep(1)
+                """).format(path=path)
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+
+        # This wait would not be sufficient if the file had already
+        # existed, but it's simple and in practice users of open_background
+        # are not using it on existing files.
+        self.wait_for_visible(basename)
+
+        return rproc
+
+    def open_dir_background(self, basename):
+        """
+        Create and hold a capability to a directory.
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.hostfs_mntpt, basename)
+
+        pyscript = dedent("""
+            import time
+            import os
+
+            os.mkdir("{path}")
+            fd = os.open("{path}", os.O_RDONLY)
+            while True:
+                time.sleep(1)
+            """).format(path=path)
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+
+        self.wait_for_visible(basename)
+
+        return rproc
+
+    def wait_for_dir_empty(self, dirname, timeout=30):
+        dirpath = os.path.join(self.hostfs_mntpt, dirname)
+        with safe_while(sleep=5, tries=(timeout//5)) as proceed:
+            while proceed():
+                p = self.run_shell_payload(f"stat -c %h {dirpath}")
+                nr_links = int(p.stdout.getvalue().strip())
+                if nr_links == 2:
+                    return
+
+    def wait_for_visible(self, basename="background_file", timeout=30):
+        i = 0
+        while i < timeout:
+            r = self.client_remote.run(args=[
+                'stat', os.path.join(self.hostfs_mntpt, basename)
+            ], check_status=False)
+            if r.exitstatus == 0:
+                log.debug("File {0} became visible from {1} after {2}s".format(
+                    basename, self.client_id, i))
+                return
+            else:
+                time.sleep(1)
+                i += 1
+
+        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
+            i, basename, self.client_id))
+
+    def lock_background(self, basename="background_file", do_flock=True):
+        """
+        Open and lock a files for writing, hold the lock in a background process
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.hostfs_mntpt, basename)
+
+        script_builder = """
+            import time
+            import fcntl
+            import struct"""
+        if do_flock:
+            script_builder += """
+            f1 = open("{path}-1", 'w')
+            fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
+        script_builder += """
+            f2 = open("{path}-2", 'w')
+            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            while True:
+                time.sleep(1)
+            """
+
+        pyscript = dedent(script_builder).format(path=path)
+
+        log.info("lock_background file {0}".format(basename))
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def lock_and_release(self, basename="background_file"):
+        assert(self.is_mounted())
+
+        path = os.path.join(self.hostfs_mntpt, basename)
+
+        script = """
+            import time
+            import fcntl
+            import struct
+            f1 = open("{path}-1", 'w')
+            fcntl.flock(f1, fcntl.LOCK_EX)
+            f2 = open("{path}-2", 'w')
+            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            """
+        pyscript = dedent(script).format(path=path)
+
+        log.info("lock_and_release file {0}".format(basename))
+        return self._run_python(pyscript)
+
+    def check_filelock(self, basename="background_file", do_flock=True):
+        assert(self.is_mounted())
+
+        path = os.path.join(self.hostfs_mntpt, basename)
+
+        script_builder = """
+            import fcntl
+            import errno
+            import struct"""
+        if do_flock:
+            script_builder += """
+            f1 = open("{path}-1", 'r')
+            try:
+                fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except IOError as e:
+                if e.errno == errno.EAGAIN:
+                    pass
+            else:
+                raise RuntimeError("flock on file {path}-1 not found")"""
+        script_builder += """
+            f2 = open("{path}-2", 'r')
+            try:
+                lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+                fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            except IOError as e:
+                if e.errno == errno.EAGAIN:
+                    pass
+            else:
+                raise RuntimeError("posix lock on file {path}-2 not found")
+            """
+        pyscript = dedent(script_builder).format(path=path)
+
+        log.info("check lock on file {0}".format(basename))
+        self.client_remote.run(args=[
+            'python3', '-c', pyscript
+        ])
+
+    def write_background(self, basename="background_file", loop=False):
+        """
+        Open a file for writing, complete as soon as you can
+        :param basename:
+        :return:
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.hostfs_mntpt, basename)
+
+        pyscript = dedent("""
+            import os
+            import time
+
+            fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0o644)
+            try:
+                while True:
+                    os.write(fd, b'content')
+                    time.sleep(1)
+                    if not {loop}:
+                        break
+            except IOError as e:
+                pass
+            os.close(fd)
+            """).format(path=path, loop=str(loop))
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def write_n_mb(self, filename, n_mb, seek=0, wait=True):
+        """
+        Write the requested number of megabytes to a file
+        """
+        assert(self.is_mounted())
+
+        return self.run_shell(["dd", "if=/dev/urandom", "of={0}".format(filename),
+                               "bs=1M", "conv=fdatasync",
+                               "count={0}".format(int(n_mb)),
+                               "seek={0}".format(int(seek))
+                               ], wait=wait)
+
+    def write_test_pattern(self, filename, size):
+        log.info("Writing {0} bytes to {1}".format(size, filename))
+        return self.run_python(dedent("""
+            import zlib
+            path = "{path}"
+            with open(path, 'w') as f:
+                for i in range(0, {size}):
+                    val = zlib.crc32(str(i).encode('utf-8')) & 7
+                    f.write(chr(val))
+        """.format(
+            path=os.path.join(self.hostfs_mntpt, filename),
+            size=size
+        )))
+
+    def validate_test_pattern(self, filename, size):
+        log.info("Validating {0} bytes from {1}".format(size, filename))
+        # Use sudo because cephfs-data-scan may recreate the file with owner==root
+        return self.run_python(dedent("""
+            import zlib
+            path = "{path}"
+            with open(path, 'r') as f:
+                bytes = f.read()
+            if len(bytes) != {size}:
+                raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
+                    len(bytes), {size}
+                ))
+            for i, b in enumerate(bytes):
+                val = zlib.crc32(str(i).encode('utf-8')) & 7
+                if b != chr(val):
+                    raise RuntimeError("Bad data at offset {{0}}".format(i))
+        """.format(
+            path=os.path.join(self.hostfs_mntpt, filename),
+            size=size
+        )), sudo=True)
+
+    def open_n_background(self, fs_path, count):
+        """
+        Open N files for writing, hold them open in a background process
+
+        :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
+        :return: a RemoteProcess
+        """
+        assert(self.is_mounted())
+
+        abs_path = os.path.join(self.hostfs_mntpt, fs_path)
+
+        pyscript = dedent("""
+            import sys
+            import time
+            import os
+
+            n = {count}
+            abs_path = "{abs_path}"
+
+            if not os.path.exists(abs_path):
+                os.makedirs(abs_path)
+
+            handles = []
+            for i in range(0, n):
+                fname = "file_"+str(i)
+                path = os.path.join(abs_path, fname)
+                handles.append(open(path, 'w'))
+
+            while True:
+                time.sleep(1)
+            """).format(abs_path=abs_path, count=count)
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def create_n_files(self, fs_path, count, sync=False, dirsync=False, unlink=False, finaldirsync=False):
+        """
+        Create n files.
+
+        :param sync: sync the file after writing
+        :param dirsync: sync the containing directory after closing the file
+        :param unlink: unlink the file after closing
+        :param finaldirsync: sync the containing directory after closing the last file
+        """
+
+        assert(self.is_mounted())
+
+        abs_path = os.path.join(self.hostfs_mntpt, fs_path)
+
+        pyscript = dedent(f"""
+            import os
+
+            n = {count}
+            path = "{abs_path}"
+
+            dpath = os.path.dirname(path)
+            fnameprefix = os.path.basename(path)
+            os.makedirs(dpath, exist_ok=True)
+
+            try:
+                dirfd = os.open(dpath, os.O_DIRECTORY)
+
+                for i in range(n):
+                    fpath = os.path.join(dpath, f"{{fnameprefix}}_{{i}}")
+                    with open(fpath, 'w') as f:
+                        f.write(f"{{i}}")
+                        if {sync}:
+                            f.flush()
+                            os.fsync(f.fileno())
+                    if {unlink}:
+                        os.unlink(fpath)
+                    if {dirsync}:
+                        os.fsync(dirfd)
+                if {finaldirsync}:
+                    os.fsync(dirfd)
+            finally:
+                os.close(dirfd)
+            """)
+
+        self.run_python(pyscript)
+
+    def teardown(self):
+        for p in self.background_procs:
+            log.info("Terminating background process")
+            self._kill_background(p)
+
+        self.background_procs = []
+
+    def _kill_background(self, p):
+        if p.stdin:
+            p.stdin.close()
+            try:
+                p.wait()
+            except (CommandFailedError, ConnectionLostError):
+                pass
+
+    def kill_background(self, p):
+        """
+        For a process that was returned by one of the _background member functions,
+        kill it hard.
+        """
+        self._kill_background(p)
+        self.background_procs.remove(p)
+
+    def send_signal(self, signal):
+        signal = signal.lower()
+        if signal.lower() not in ['sigstop', 'sigcont', 'sigterm', 'sigkill']:
+            raise NotImplementedError
+
+        self.client_remote.run(args=['sudo', 'kill', '-{0}'.format(signal),
+                                self.client_pid], omit_sudo=False)
+
+    def get_global_id(self):
+        raise NotImplementedError()
+
+    def get_global_inst(self):
+        raise NotImplementedError()
+
+    def get_global_addr(self):
+        raise NotImplementedError()
+
+    def get_osd_epoch(self):
+        raise NotImplementedError()
+
+    def get_op_read_count(self):
+        raise NotImplementedError()
+
+    def lstat(self, fs_path, follow_symlinks=False, wait=True):
+        return self.stat(fs_path, follow_symlinks=False, wait=True)
+
+    def stat(self, fs_path, follow_symlinks=True, wait=True, **kwargs):
+        """
+        stat a file, and return the result as a dictionary like this:
+        {
+          "st_ctime": 1414161137.0,
+          "st_mtime": 1414161137.0,
+          "st_nlink": 33,
+          "st_gid": 0,
+          "st_dev": 16777218,
+          "st_size": 1190,
+          "st_ino": 2,
+          "st_uid": 0,
+          "st_mode": 16877,
+          "st_atime": 1431520593.0
+        }
+
+        Raises exception on absent file.
+        """
+        abs_path = os.path.join(self.hostfs_mntpt, fs_path)
+        if follow_symlinks:
+            stat_call = "os.stat('" + abs_path + "')"
+        else:
+            stat_call = "os.lstat('" + abs_path + "')"
+
+        pyscript = dedent("""
+            import os
+            import stat
+            import json
+            import sys
+
+            try:
+                s = {stat_call}
+            except OSError as e:
+                sys.exit(e.errno)
+
+            attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
+            print(json.dumps(
+                dict([(a, getattr(s, a)) for a in attrs]),
+                indent=2))
+            """).format(stat_call=stat_call)
+        proc = self._run_python(pyscript, **kwargs)
+        if wait:
+            proc.wait()
+            return json.loads(proc.stdout.getvalue().strip())
+        else:
+            return proc
+
+    def touch(self, fs_path):
+        """
+        Create a dentry if it doesn't already exist.  This python
+        implementation exists because the usual command line tool doesn't
+        pass through error codes like EIO.
+
+        :param fs_path:
+        :return:
+        """
+        abs_path = os.path.join(self.hostfs_mntpt, fs_path)
+        pyscript = dedent("""
+            import sys
+            import errno
+
+            try:
+                f = open("{path}", "w")
+                f.close()
+            except IOError as e:
+                sys.exit(errno.EIO)
+            """).format(path=abs_path)
+        proc = self._run_python(pyscript)
+        proc.wait()
+
+    def path_to_ino(self, fs_path, follow_symlinks=True):
+        abs_path = os.path.join(self.hostfs_mntpt, fs_path)
+
+        if follow_symlinks:
+            pyscript = dedent("""
+                import os
+                import stat
+
+                print(os.stat("{path}").st_ino)
+                """).format(path=abs_path)
+        else:
+            pyscript = dedent("""
+                import os
+                import stat
+
+                print(os.lstat("{path}").st_ino)
+                """).format(path=abs_path)
+
+        proc = self._run_python(pyscript)
+        proc.wait()
+        return int(proc.stdout.getvalue().strip())
+
+    def path_to_nlink(self, fs_path):
+        abs_path = os.path.join(self.hostfs_mntpt, fs_path)
+
+        pyscript = dedent("""
+            import os
+            import stat
+
+            print(os.stat("{path}").st_nlink)
+            """).format(path=abs_path)
+
+        proc = self._run_python(pyscript)
+        proc.wait()
+        return int(proc.stdout.getvalue().strip())
+
+    def ls(self, path=None, **kwargs):
+        """
+        Wrap ls: return a list of strings
+        """
+        cmd = ["ls"]
+        if path:
+            cmd.append(path)
+
+        ls_text = self.run_shell(cmd, **kwargs).stdout.getvalue().strip()
+
+        if ls_text:
+            return ls_text.split("\n")
+        else:
+            # Special case because otherwise split on empty string
+            # gives you [''] instead of []
+            return []
+
+    def setfattr(self, path, key, val, **kwargs):
+        """
+        Wrap setfattr.
+
+        :param path: relative to mount point
+        :param key: xattr name
+        :param val: xattr value
+        :return: None
+        """
+        self.run_shell(["setfattr", "-n", key, "-v", val, path], **kwargs)
+
+    def getfattr(self, path, attr, **kwargs):
+        """
+        Wrap getfattr: return the values of a named xattr on one file, or
+        None if the attribute is not found.
+
+        :return: a string
+        """
+        p = self.run_shell(["getfattr", "--only-values", "-n", attr, path], wait=False, **kwargs)
+        try:
+            p.wait()
+        except CommandFailedError as e:
+            if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue():
+                return None
+            else:
+                raise
+
+        return str(p.stdout.getvalue())
+
+    def df(self):
+        """
+        Wrap df: return a dict of usage fields in bytes
+        """
+
+        p = self.run_shell(["df", "-B1", "."])
+        lines = p.stdout.getvalue().strip().split("\n")
+        fs, total, used, avail = lines[1].split()[:4]
+        log.warning(lines)
+
+        return {
+            "total": int(total),
+            "used": int(used),
+            "available": int(avail)
+        }
+
+    def dir_checksum(self, path=None, follow_symlinks=False):
+        cmd = ["find"]
+        if follow_symlinks:
+            cmd.append("-L")
+        if path:
+            cmd.append(path)
+        cmd.extend(["-type", "f", "-exec", "md5sum", "{}", "+"])
+        checksum_text = self.run_shell(cmd).stdout.getvalue().strip()
+        checksum_sorted = sorted(checksum_text.split('\n'), key=lambda v: v.split()[1])
+        return hashlib.md5(('\n'.join(checksum_sorted)).encode('utf-8')).hexdigest()
diff --git a/qa/tasks/cephfs/test_acls.py b/qa/tasks/cephfs/test_acls.py
new file mode 100644
index 000000000..4f704c076
--- /dev/null
+++ b/qa/tasks/cephfs/test_acls.py
@@ -0,0 +1,27 @@
+import logging
+
+from io import BytesIO
+from tasks.cephfs.xfstests_dev import XFSTestsDev
+
+log = logging.getLogger(__name__)
+
+class TestACLs(XFSTestsDev):
+
+    def test_acls(self):
+        from tasks.cephfs.fuse_mount import FuseMount
+        from tasks.cephfs.kernel_mount import KernelMount
+
+        # TODO: make xfstests-dev compatible with ceph-fuse. xfstests-dev
+        # remounts CephFS before running tests using kernel, so ceph-fuse
+        # mounts are never actually testsed.
+        if isinstance(self.mount_a, FuseMount):
+            log.info('client is fuse mounted')
+            self.skipTest('Requires kernel client; xfstests-dev not '\
+                          'compatible with ceph-fuse ATM.')
+        elif isinstance(self.mount_a, KernelMount):
+            log.info('client is kernel mounted')
+
+        self.mount_a.client_remote.run(args=['sudo', './check',
+            'generic/099'], cwd=self.repo_path, stdout=BytesIO(),
+            stderr=BytesIO(), timeout=30, check_status=True,
+            label='running tests for ACLs from xfstests-dev')
diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py
new file mode 100644
index 000000000..bbf069066
--- /dev/null
+++ b/qa/tasks/cephfs/test_admin.py
@@ -0,0 +1,912 @@
+import errno
+import json
+import logging
+import time
+import uuid
+from io import StringIO
+from os.path import join as os_path_join
+
+from teuthology.orchestra.run import CommandFailedError, Raw
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.filesystem import FileLayout, FSMissing
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.caps_helper import CapsHelper
+
+log = logging.getLogger(__name__)
+
+class TestAdminCommands(CephFSTestCase):
+    """
+    Tests for administration command.
+    """
+
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 3
+
+    def test_fsnames_can_only_by_goodchars(self):
+        n = 'test_fsnames_can_only_by_goodchars'
+        metapoolname, datapoolname = n+'-testmetapool', n+'-testdatapool'
+        badname = n+'badname@#'
+
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                            n+metapoolname)
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                            n+datapoolname)
+
+        # test that fsname not with "goodchars" fails
+        args = ['fs', 'new', badname, metapoolname, datapoolname]
+        proc = self.fs.mon_manager.run_cluster_cmd(args=args,stderr=StringIO(),
+                                                   check_status=False)
+        self.assertIn('invalid chars', proc.stderr.getvalue().lower())
+
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'rm', metapoolname,
+                                            metapoolname,
+                                            '--yes-i-really-really-mean-it-not-faking')
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'rm', datapoolname,
+                                            datapoolname,
+                                            '--yes-i-really-really-mean-it-not-faking')
+
+    def test_fs_status(self):
+        """
+        That `ceph fs status` command functions.
+        """
+
+        s = self.fs.mon_manager.raw_cluster_cmd("fs", "status")
+        self.assertTrue("active" in s)
+
+        mdsmap = json.loads(self.fs.mon_manager.raw_cluster_cmd("fs", "status", "--format=json-pretty"))["mdsmap"]
+        self.assertEqual(mdsmap[0]["state"], "active")
+
+        mdsmap = json.loads(self.fs.mon_manager.raw_cluster_cmd("fs", "status", "--format=json"))["mdsmap"]
+        self.assertEqual(mdsmap[0]["state"], "active")
+
+    def _setup_ec_pools(self, n, metadata=True, overwrites=True):
+        if metadata:
+            self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', n+"-meta", "8")
+        cmd = ['osd', 'erasure-code-profile', 'set', n+"-profile", "m=2", "k=2", "crush-failure-domain=osd"]
+        self.fs.mon_manager.raw_cluster_cmd(*cmd)
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', n+"-data", "8", "erasure", n+"-profile")
+        if overwrites:
+            self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true')
+
+    def _check_pool_application_metadata_key_value(self, pool, app, key, value):
+        output = self.fs.mon_manager.raw_cluster_cmd(
+            'osd', 'pool', 'application', 'get', pool, app, key)
+        self.assertEqual(str(output.strip()), value)
+
+    def test_add_data_pool_root(self):
+        """
+        That a new data pool can be added and used for the root directory.
+        """
+
+        p = self.fs.add_data_pool("foo")
+        self.fs.set_dir_layout(self.mount_a, ".", FileLayout(pool=p))
+
+    def test_add_data_pool_application_metadata(self):
+        """
+        That the application metadata set on a newly added data pool is as expected.
+        """
+        pool_name = "foo"
+        mon_cmd = self.fs.mon_manager.raw_cluster_cmd
+        mon_cmd('osd', 'pool', 'create', pool_name, '--pg_num_min',
+                str(self.fs.pg_num_min))
+        # Check whether https://tracker.ceph.com/issues/43061 is fixed
+        mon_cmd('osd', 'pool', 'application', 'enable', pool_name, 'cephfs')
+        self.fs.add_data_pool(pool_name, create=False)
+        self._check_pool_application_metadata_key_value(
+            pool_name, 'cephfs', 'data', self.fs.name)
+
+    def test_add_data_pool_subdir(self):
+        """
+        That a new data pool can be added and used for a sub-directory.
+        """
+
+        p = self.fs.add_data_pool("foo")
+        self.mount_a.run_shell("mkdir subdir")
+        self.fs.set_dir_layout(self.mount_a, "subdir", FileLayout(pool=p))
+
+    def test_add_data_pool_non_alphamueric_name_as_subdir(self):
+        """
+        That a new data pool with non-alphanumeric name can be added and used for a sub-directory.
+        """
+        p = self.fs.add_data_pool("I-am-data_pool00.")
+        self.mount_a.run_shell("mkdir subdir")
+        self.fs.set_dir_layout(self.mount_a, "subdir", FileLayout(pool=p))
+
+    def test_add_data_pool_ec(self):
+        """
+        That a new EC data pool can be added.
+        """
+
+        n = "test_add_data_pool_ec"
+        self._setup_ec_pools(n, metadata=False)
+        self.fs.add_data_pool(n+"-data", create=False)
+
+    def test_new_default_ec(self):
+        """
+        That a new file system warns/fails with an EC default data pool.
+        """
+
+        self.mount_a.umount_wait(require_clean=True)
+        self.mds_cluster.delete_all_filesystems()
+        n = "test_new_default_ec"
+        self._setup_ec_pools(n)
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data")
+        except CommandFailedError as e:
+            if e.exitstatus == 22:
+                pass
+            else:
+                raise
+        else:
+            raise RuntimeError("expected failure")
+
+    def test_new_default_ec_force(self):
+        """
+        That a new file system succeeds with an EC default data pool with --force.
+        """
+
+        self.mount_a.umount_wait(require_clean=True)
+        self.mds_cluster.delete_all_filesystems()
+        n = "test_new_default_ec_force"
+        self._setup_ec_pools(n)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data", "--force")
+
+    def test_new_default_ec_no_overwrite(self):
+        """
+        That a new file system fails with an EC default data pool without overwrite.
+        """
+
+        self.mount_a.umount_wait(require_clean=True)
+        self.mds_cluster.delete_all_filesystems()
+        n = "test_new_default_ec_no_overwrite"
+        self._setup_ec_pools(n, overwrites=False)
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data")
+        except CommandFailedError as e:
+            if e.exitstatus == 22:
+                pass
+            else:
+                raise
+        else:
+            raise RuntimeError("expected failure")
+        # and even with --force !
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data", "--force")
+        except CommandFailedError as e:
+            if e.exitstatus == 22:
+                pass
+            else:
+                raise
+        else:
+            raise RuntimeError("expected failure")
+
+    def test_fs_new_pool_application_metadata(self):
+        """
+        That the application metadata set on the pools of a newly created filesystem are as expected.
+        """
+        self.mount_a.umount_wait(require_clean=True)
+        self.mds_cluster.delete_all_filesystems()
+        fs_name = "test_fs_new_pool_application"
+        keys = ['metadata', 'data']
+        pool_names = [fs_name+'-'+key for key in keys]
+        mon_cmd = self.fs.mon_manager.raw_cluster_cmd
+        for p in pool_names:
+            mon_cmd('osd', 'pool', 'create', p, '--pg_num_min', str(self.fs.pg_num_min))
+            mon_cmd('osd', 'pool', 'application', 'enable', p, 'cephfs')
+        mon_cmd('fs', 'new', fs_name, pool_names[0], pool_names[1])
+        for i in range(2):
+            self._check_pool_application_metadata_key_value(
+                pool_names[i], 'cephfs', keys[i], fs_name)
+
+    def test_fs_new_with_specific_id(self):
+        """
+        That a file system can be created with a specific ID.
+        """
+        fs_name = "test_fs_specific_id"
+        fscid = 100
+        keys = ['metadata', 'data']
+        pool_names = [fs_name+'-'+key for key in keys]
+        for p in pool_names:
+            self.run_cluster_cmd(f'osd pool create {p}')
+        self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid  {fscid} --force')
+        self.fs.status().get_fsmap(fscid)
+        for i in range(2):
+            self._check_pool_application_metadata_key_value(pool_names[i], 'cephfs', keys[i], fs_name)
+
+    def test_fs_new_with_specific_id_idempotency(self):
+        """
+        That command to create file system with specific ID is idempotent.
+        """
+        fs_name = "test_fs_specific_id"
+        fscid = 100
+        keys = ['metadata', 'data']
+        pool_names = [fs_name+'-'+key for key in keys]
+        for p in pool_names:
+            self.run_cluster_cmd(f'osd pool create {p}')
+        self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid  {fscid} --force')
+        self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid  {fscid} --force')
+        self.fs.status().get_fsmap(fscid)
+
+    def test_fs_new_with_specific_id_fails_without_force_flag(self):
+        """
+        That command to create file system with specific ID fails without '--force' flag.
+        """
+        fs_name = "test_fs_specific_id"
+        fscid = 100
+        keys = ['metadata', 'data']
+        pool_names = [fs_name+'-'+key for key in keys]
+        for p in pool_names:
+            self.run_cluster_cmd(f'osd pool create {p}')
+        try:
+            self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid  {fscid}')
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL,
+                "invalid error code on creating a file system with specifc ID without --force flag")
+        else:
+            self.fail("expected creating file system with specific ID without '--force' flag to fail")
+
+    def test_fs_new_with_specific_id_fails_already_in_use(self):
+        """
+        That creating file system with ID already in use fails.
+        """
+        fs_name = "test_fs_specific_id"
+        # file system ID already in use
+        fscid =  self.fs.status().map['filesystems'][0]['id']
+        keys = ['metadata', 'data']
+        pool_names = [fs_name+'-'+key for key in keys]
+        for p in pool_names:
+            self.run_cluster_cmd(f'osd pool create {p}')
+        try:
+            self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid  {fscid} --force')
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL,
+                "invalid error code on creating a file system with specifc ID that is already in use")
+        else:
+            self.fail("expected creating file system with ID already in use to fail")
+
+
+class TestDump(CephFSTestCase):
+    CLIENTS_REQUIRED = 0
+    MDSS_REQUIRED = 1
+
+    def test_fs_dump_epoch(self):
+        """
+        That dumping a specific epoch works.
+        """
+
+        status1 = self.fs.status()
+        status2 = self.fs.status(epoch=status1["epoch"]-1)
+        self.assertEqual(status1["epoch"], status2["epoch"]+1)
+
+    def test_fsmap_trim(self):
+        """
+        That the fsmap is trimmed normally.
+        """
+
+        paxos_service_trim_min = 25
+        self.config_set('mon', 'paxos_service_trim_min', paxos_service_trim_min)
+        mon_max_mdsmap_epochs = 20
+        self.config_set('mon', 'mon_max_mdsmap_epochs', mon_max_mdsmap_epochs)
+
+        status = self.fs.status()
+        epoch = status["epoch"]
+
+        # for N mutations
+        mutations = paxos_service_trim_min + mon_max_mdsmap_epochs
+        b = False
+        for i in range(mutations):
+            self.fs.set_joinable(b)
+            b = not b
+
+        time.sleep(10) # for tick/compaction
+
+        try:
+            self.fs.status(epoch=epoch)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT, "invalid error code when trying to fetch FSMap that was trimmed")
+        else:
+            self.fail("trimming did not occur as expected")
+
+    def test_fsmap_force_trim(self):
+        """
+        That the fsmap is trimmed forcefully.
+        """
+
+        status = self.fs.status()
+        epoch = status["epoch"]
+
+        paxos_service_trim_min = 1
+        self.config_set('mon', 'paxos_service_trim_min', paxos_service_trim_min)
+        mon_mds_force_trim_to = epoch+1
+        self.config_set('mon', 'mon_mds_force_trim_to', mon_mds_force_trim_to)
+
+        # force a new fsmap
+        self.fs.set_joinable(False)
+        time.sleep(10) # for tick/compaction
+
+        status = self.fs.status()
+        log.debug(f"new epoch is {status['epoch']}")
+        self.fs.status(epoch=epoch+1) # epoch+1 is not trimmed, may not == status["epoch"]
+
+        try:
+            self.fs.status(epoch=epoch)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT, "invalid error code when trying to fetch FSMap that was trimmed")
+        else:
+            self.fail("trimming did not occur as expected")
+
+class TestRequiredClientFeatures(CephFSTestCase):
+    CLIENTS_REQUIRED = 0
+    MDSS_REQUIRED = 1
+
+    def test_required_client_features(self):
+        """
+        That `ceph fs required_client_features` command functions.
+        """
+
+        def is_required(index):
+            out = self.fs.mon_manager.raw_cluster_cmd('fs', 'get', self.fs.name, '--format=json-pretty')
+            features = json.loads(out)['mdsmap']['required_client_features']
+            if "feature_{0}".format(index) in features:
+                return True;
+            return False;
+
+        features = json.loads(self.fs.mon_manager.raw_cluster_cmd('fs', 'feature', 'ls', '--format=json-pretty'))
+        self.assertGreater(len(features), 0);
+
+        for f in features:
+            self.fs.required_client_features('rm', str(f['index']))
+
+        for f in features:
+            index = f['index']
+            feature = f['name']
+            if feature == 'reserved':
+                feature = str(index)
+
+            if index % 3 == 0:
+                continue;
+            self.fs.required_client_features('add', feature)
+            self.assertTrue(is_required(index))
+
+            if index % 2 == 0:
+                continue;
+            self.fs.required_client_features('rm', feature)
+            self.assertFalse(is_required(index))
+
+    def test_required_client_feature_add_reserved(self):
+        """
+        That `ceph fs required_client_features X add reserved` fails.
+        """
+
+        p = self.fs.required_client_features('add', 'reserved', check_status=False, stderr=StringIO())
+        self.assertIn('Invalid feature name', p.stderr.getvalue())
+
+    def test_required_client_feature_rm_reserved(self):
+        """
+        That `ceph fs required_client_features X rm reserved` fails.
+        """
+
+        p = self.fs.required_client_features('rm', 'reserved', check_status=False, stderr=StringIO())
+        self.assertIn('Invalid feature name', p.stderr.getvalue())
+
+    def test_required_client_feature_add_reserved_bit(self):
+        """
+        That `ceph fs required_client_features X add <reserved_bit>` passes.
+        """
+
+        p = self.fs.required_client_features('add', '1', stderr=StringIO())
+        self.assertIn("added feature 'reserved' to required_client_features", p.stderr.getvalue())
+
+    def test_required_client_feature_rm_reserved_bit(self):
+        """
+        That `ceph fs required_client_features X rm <reserved_bit>` passes.
+        """
+
+        self.fs.required_client_features('add', '1')
+        p = self.fs.required_client_features('rm', '1', stderr=StringIO())
+        self.assertIn("removed feature 'reserved' from required_client_features", p.stderr.getvalue())
+
+class TestCompatCommands(CephFSTestCase):
+    """
+    """
+
+    CLIENTS_REQUIRED = 0
+    MDSS_REQUIRED = 3
+
+    def test_add_compat(self):
+        """
+        Test adding a compat.
+        """
+
+        self.fs.fail()
+        self.fs.add_compat(63, 'placeholder')
+        mdsmap = self.fs.get_mds_map()
+        self.assertIn("feature_63", mdsmap['compat']['compat'])
+
+    def test_add_incompat(self):
+        """
+        Test adding an incompat.
+        """
+
+        self.fs.fail()
+        self.fs.add_incompat(63, 'placeholder')
+        mdsmap = self.fs.get_mds_map()
+        log.info(f"{mdsmap}")
+        self.assertIn("feature_63", mdsmap['compat']['incompat'])
+
+    def test_rm_compat(self):
+        """
+        Test removing a compat.
+        """
+
+        self.fs.fail()
+        self.fs.add_compat(63, 'placeholder')
+        self.fs.rm_compat(63)
+        mdsmap = self.fs.get_mds_map()
+        self.assertNotIn("feature_63", mdsmap['compat']['compat'])
+
+    def test_rm_incompat(self):
+        """
+        Test removing an incompat.
+        """
+
+        self.fs.fail()
+        self.fs.add_incompat(63, 'placeholder')
+        self.fs.rm_incompat(63)
+        mdsmap = self.fs.get_mds_map()
+        self.assertNotIn("feature_63", mdsmap['compat']['incompat'])
+
+    def test_standby_compat(self):
+        """
+        That adding a compat does not prevent standbys from joining.
+        """
+
+        self.fs.fail()
+        self.fs.add_compat(63, "placeholder")
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        mdsmap = self.fs.get_mds_map()
+        self.assertIn("feature_63", mdsmap['compat']['compat'])
+
+    def test_standby_incompat_reject(self):
+        """
+        That adding an incompat feature prevents incompatible daemons from joining.
+        """
+
+        self.fs.fail()
+        self.fs.add_incompat(63, "placeholder")
+        self.fs.set_joinable()
+        try:
+            self.fs.wait_for_daemons(timeout=60)
+        except RuntimeError as e:
+            if "Timed out waiting for MDS daemons to become healthy" in str(e):
+                pass
+            else:
+                raise
+        else:
+            self.fail()
+
+    def test_standby_incompat_upgrade(self):
+        """
+        That an MDS can upgrade the compat of a fs.
+        """
+
+        self.fs.fail()
+        self.fs.rm_incompat(1)
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        mdsmap = self.fs.get_mds_map()
+        self.assertIn("feature_1", mdsmap['compat']['incompat'])
+
+    def test_standby_replay_not_upgradeable(self):
+        """
+        That the mons will not upgrade the MDSMap compat if standby-replay is
+        enabled.
+        """
+
+        self.fs.fail()
+        self.fs.rm_incompat(1)
+        self.fs.set_allow_standby_replay(True)
+        self.fs.set_joinable()
+        try:
+            self.fs.wait_for_daemons(timeout=60)
+        except RuntimeError as e:
+            if "Timed out waiting for MDS daemons to become healthy" in str(e):
+                pass
+            else:
+                raise
+        else:
+            self.fail()
+
+    def test_standby_incompat_reject_multifs(self):
+        """
+        Like test_standby_incompat_reject but with a second fs.
+        """
+
+        fs2 = self.mds_cluster.newfs(name="cephfs2", create=True)
+        fs2.fail()
+        fs2.add_incompat(63, 'placeholder')
+        fs2.set_joinable()
+        try:
+            fs2.wait_for_daemons(timeout=60)
+        except RuntimeError as e:
+            if "Timed out waiting for MDS daemons to become healthy" in str(e):
+                pass
+            else:
+                raise
+        else:
+            self.fail()
+        # did self.fs lose MDS or standbys suicide?
+        self.fs.wait_for_daemons()
+        mdsmap = fs2.get_mds_map()
+        self.assertIn("feature_63", mdsmap['compat']['incompat'])
+
+class TestConfigCommands(CephFSTestCase):
+    """
+    Test that daemons and clients respond to the otherwise rarely-used
+    runtime config modification operations.
+    """
+
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def test_ceph_config_show(self):
+        """
+        That I can successfully show MDS configuration.
+        """
+
+        names = self.fs.get_rank_names()
+        for n in names:
+            s = self.fs.mon_manager.raw_cluster_cmd("config", "show", "mds."+n)
+            self.assertTrue("NAME" in s)
+            self.assertTrue("mon_host" in s)
+
+
+    def test_client_config(self):
+        """
+        That I can successfully issue asok "config set" commands
+
+        :return:
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Test only applies to FUSE clients")
+
+        test_key = "client_cache_size"
+        test_val = "123"
+        self.mount_a.admin_socket(['config', 'set', test_key, test_val])
+        out = self.mount_a.admin_socket(['config', 'get', test_key])
+        self.assertEqual(out[test_key], test_val)
+
+
+    def test_mds_config_asok(self):
+        test_key = "mds_max_purge_ops"
+        test_val = "123"
+        self.fs.mds_asok(['config', 'set', test_key, test_val])
+        out = self.fs.mds_asok(['config', 'get', test_key])
+        self.assertEqual(out[test_key], test_val)
+
+    def test_mds_config_tell(self):
+        test_key = "mds_max_purge_ops"
+        test_val = "123"
+
+        self.fs.rank_tell(['injectargs', "--{0}={1}".format(test_key, test_val)])
+
+        # Read it back with asok because there is no `tell` equivalent
+        out = self.fs.rank_tell(['config', 'get', test_key])
+        self.assertEqual(out[test_key], test_val)
+
+
+class TestMirroringCommands(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def _enable_mirroring(self, fs_name):
+        self.fs.mon_manager.raw_cluster_cmd("fs", "mirror", "enable", fs_name)
+
+    def _disable_mirroring(self, fs_name):
+        self.fs.mon_manager.raw_cluster_cmd("fs", "mirror", "disable", fs_name)
+
+    def _add_peer(self, fs_name, peer_spec, remote_fs_name):
+        peer_uuid = str(uuid.uuid4())
+        self.fs.mon_manager.raw_cluster_cmd("fs", "mirror", "peer_add", fs_name, peer_uuid, peer_spec, remote_fs_name)
+
+    def _remove_peer(self, fs_name, peer_uuid):
+        self.fs.mon_manager.raw_cluster_cmd("fs", "mirror", "peer_remove", fs_name, peer_uuid)
+
+    def _verify_mirroring(self, fs_name, flag_str):
+        status = self.fs.status()
+        fs_map = status.get_fsmap_byname(fs_name)
+        if flag_str == 'enabled':
+            self.assertTrue('mirror_info' in fs_map)
+        elif flag_str == 'disabled':
+            self.assertTrue('mirror_info' not in fs_map)
+        else:
+            raise RuntimeError(f'invalid flag_str {flag_str}')
+
+    def _get_peer_uuid(self, fs_name, peer_spec):
+        status = self.fs.status()
+        fs_map = status.get_fsmap_byname(fs_name)
+        mirror_info = fs_map.get('mirror_info', None)
+        self.assertTrue(mirror_info is not None)
+        for peer_uuid, remote in mirror_info['peers'].items():
+            client_name = remote['remote']['client_name']
+            cluster_name = remote['remote']['cluster_name']
+            spec = f'{client_name}@{cluster_name}'
+            if spec == peer_spec:
+                return peer_uuid
+        return None
+
+    def test_mirroring_command(self):
+        """basic mirroring command test -- enable, disable mirroring on a
+        filesystem"""
+        self._enable_mirroring(self.fs.name)
+        self._verify_mirroring(self.fs.name, "enabled")
+        self._disable_mirroring(self.fs.name)
+        self._verify_mirroring(self.fs.name, "disabled")
+
+    def test_mirroring_peer_commands(self):
+        """test adding and removing peers to a mirror enabled filesystem"""
+        self._enable_mirroring(self.fs.name)
+        self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b")
+        self._add_peer(self.fs.name, "client.site-c@site-c", "fs_c")
+        self._verify_mirroring(self.fs.name, "enabled")
+        uuid_peer_b = self._get_peer_uuid(self.fs.name, "client.site-b@site-b")
+        uuid_peer_c = self._get_peer_uuid(self.fs.name, "client.site-c@site-c")
+        self.assertTrue(uuid_peer_b is not None)
+        self.assertTrue(uuid_peer_c is not None)
+        self._remove_peer(self.fs.name, uuid_peer_b)
+        self._remove_peer(self.fs.name, uuid_peer_c)
+        self._disable_mirroring(self.fs.name)
+        self._verify_mirroring(self.fs.name, "disabled")
+
+    def test_mirroring_command_idempotency(self):
+        """test to check idempotency of mirroring family of commands """
+        self._enable_mirroring(self.fs.name)
+        self._verify_mirroring(self.fs.name, "enabled")
+        self._enable_mirroring(self.fs.name)
+        # add peer
+        self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b")
+        uuid_peer_b1 = self._get_peer_uuid(self.fs.name, "client.site-b@site-b")
+        self.assertTrue(uuid_peer_b1 is not None)
+        # adding the peer again should be idempotent
+        self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b")
+        uuid_peer_b2 = self._get_peer_uuid(self.fs.name, "client.site-b@site-b")
+        self.assertTrue(uuid_peer_b2 is not None)
+        self.assertTrue(uuid_peer_b1 == uuid_peer_b2)
+        # remove peer
+        self._remove_peer(self.fs.name, uuid_peer_b1)
+        uuid_peer_b3 = self._get_peer_uuid(self.fs.name, "client.site-b@site-b")
+        self.assertTrue(uuid_peer_b3 is None)
+        # removing the peer again should be idempotent
+        self._remove_peer(self.fs.name, uuid_peer_b1)
+        self._disable_mirroring(self.fs.name)
+        self._verify_mirroring(self.fs.name, "disabled")
+        self._disable_mirroring(self.fs.name)
+
+    def test_mirroring_disable_with_peers(self):
+        """test disabling mirroring for a filesystem with active peers"""
+        self._enable_mirroring(self.fs.name)
+        self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b")
+        self._verify_mirroring(self.fs.name, "enabled")
+        uuid_peer_b = self._get_peer_uuid(self.fs.name, "client.site-b@site-b")
+        self.assertTrue(uuid_peer_b is not None)
+        self._disable_mirroring(self.fs.name)
+        self._verify_mirroring(self.fs.name, "disabled")
+        # enable mirroring to check old peers
+        self._enable_mirroring(self.fs.name)
+        self._verify_mirroring(self.fs.name, "enabled")
+        # peer should be gone
+        uuid_peer_b = self._get_peer_uuid(self.fs.name, "client.site-b@site-b")
+        self.assertTrue(uuid_peer_b is None)
+        self._disable_mirroring(self.fs.name)
+        self._verify_mirroring(self.fs.name, "disabled")
+
+    def test_mirroring_with_filesystem_reset(self):
+        """test to verify mirroring state post filesystem reset"""
+        self._enable_mirroring(self.fs.name)
+        self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b")
+        self._verify_mirroring(self.fs.name, "enabled")
+        uuid_peer_b = self._get_peer_uuid(self.fs.name, "client.site-b@site-b")
+        self.assertTrue(uuid_peer_b is not None)
+        # reset filesystem
+        self.fs.fail()
+        self.fs.reset()
+        self.fs.wait_for_daemons()
+        self._verify_mirroring(self.fs.name, "disabled")
+
+
+class TestSubCmdFsAuthorize(CapsHelper):
+    client_id = 'testuser'
+    client_name = 'client.' + client_id
+
+    def test_single_path_r(self):
+        perm = 'r'
+        filepaths, filedata, mounts, keyring = self.setup_test_env(perm)
+        moncap = self.get_mon_cap_from_keyring(self.client_name)
+
+        self.run_mon_cap_tests(moncap, keyring)
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_single_path_rw(self):
+        perm = 'rw'
+        filepaths, filedata, mounts, keyring = self.setup_test_env(perm)
+        moncap = self.get_mon_cap_from_keyring(self.client_name)
+
+        self.run_mon_cap_tests(moncap, keyring)
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_single_path_rootsquash(self):
+        filedata, filename = 'some data on fs 1', 'file_on_fs1'
+        filepath = os_path_join(self.mount_a.hostfs_mntpt, filename)
+        self.mount_a.write_file(filepath, filedata)
+
+        keyring = self.fs.authorize(self.client_id, ('/', 'rw', 'root_squash'))
+        keyring_path = self.create_keyring_file(self.mount_a.client_remote,
+                                                keyring)
+        self.mount_a.remount(client_id=self.client_id,
+                             client_keyring_path=keyring_path,
+                             cephfs_mntpt='/')
+
+        if filepath.find(self.mount_a.hostfs_mntpt) != -1:
+            # can read, but not write as root
+            contents = self.mount_a.read_file(filepath)
+            self.assertEqual(filedata, contents)
+            cmdargs = ['echo', 'some random data', Raw('|'), 'sudo', 'tee', filepath]
+            self.mount_a.negtestcmd(args=cmdargs, retval=1, errmsg='permission denied')
+
+    def test_single_path_authorize_on_nonalphanumeric_fsname(self):
+        """
+        That fs authorize command works on filesystems with names having [_.-] characters
+        """
+        self.mount_a.umount_wait(require_clean=True)
+        self.mds_cluster.delete_all_filesystems()
+        fs_name = "cephfs-_."
+        self.fs = self.mds_cluster.newfs(name=fs_name)
+        self.fs.wait_for_daemons()
+        self.run_cluster_cmd(f'auth caps client.{self.mount_a.client_id} '
+                             f'mon "allow r" '
+                             f'osd "allow rw pool={self.fs.get_data_pool_name()}" '
+                             f'mds allow')
+        self.mount_a.remount(cephfs_name=self.fs.name)
+        perm = 'rw'
+        filepaths, filedata, mounts, keyring = self.setup_test_env(perm)
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_multiple_path_r(self):
+        perm, paths = 'r', ('/dir1', '/dir2/dir22')
+        filepaths, filedata, mounts, keyring = self.setup_test_env(perm, paths)
+        moncap = self.get_mon_cap_from_keyring(self.client_name)
+
+        keyring_path = self.create_keyring_file(self.mount_a.client_remote,
+                                                keyring)
+        for path in paths:
+            self.mount_a.remount(client_id=self.client_id,
+                                 client_keyring_path=keyring_path,
+                                 cephfs_mntpt=path)
+
+
+            # actual tests...
+            self.run_mon_cap_tests(moncap, keyring)
+            self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_multiple_path_rw(self):
+        perm, paths = 'rw', ('/dir1', '/dir2/dir22')
+        filepaths, filedata, mounts, keyring = self.setup_test_env(perm, paths)
+        moncap = self.get_mon_cap_from_keyring(self.client_name)
+
+        keyring_path = self.create_keyring_file(self.mount_a.client_remote,
+                                                keyring)
+        for path in paths:
+            self.mount_a.remount(client_id=self.client_id,
+                                 client_keyring_path=keyring_path,
+                                 cephfs_mntpt=path)
+
+
+            # actual tests...
+            self.run_mon_cap_tests(moncap, keyring)
+            self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def tearDown(self):
+        self.mount_a.umount_wait()
+        self.run_cluster_cmd(f'auth rm {self.client_name}')
+
+        super(type(self), self).tearDown()
+
+    def setup_for_single_path(self, perm):
+        filedata, filename = 'some data on fs 1', 'file_on_fs1'
+
+        filepath = os_path_join(self.mount_a.hostfs_mntpt, filename)
+        self.mount_a.write_file(filepath, filedata)
+
+        keyring = self.fs.authorize(self.client_id, ('/', perm))
+        keyring_path = self.create_keyring_file(self.mount_a.client_remote,
+                                                keyring)
+
+        self.mount_a.remount(client_id=self.client_id,
+                             client_keyring_path=keyring_path,
+                             cephfs_mntpt='/')
+
+        return filepath, filedata, keyring
+
+    def setup_for_multiple_paths(self, perm, paths):
+        filedata, filename = 'some data on fs 1', 'file_on_fs1'
+
+        self.mount_a.run_shell('mkdir -p dir1/dir12/dir13 dir2/dir22/dir23')
+
+        filepaths = []
+        for path in paths:
+            filepath = os_path_join(self.mount_a.hostfs_mntpt, path[1:], filename)
+            self.mount_a.write_file(filepath, filedata)
+            filepaths.append(filepath.replace(path, ''))
+        filepaths = tuple(filepaths)
+
+        keyring = self.fs.authorize(self.client_id, (paths[0], perm, paths[1],
+                                                     perm))
+
+        return filepaths, filedata, keyring
+
+    def setup_test_env(self, perm, paths=()):
+        filepaths, filedata, keyring = self.setup_for_multiple_paths(perm, paths) if paths \
+            else self.setup_for_single_path(perm)
+
+        if not isinstance(filepaths, tuple):
+            filepaths = (filepaths, )
+        if not isinstance(filedata, tuple):
+            filedata = (filedata, )
+        mounts = (self.mount_a, )
+
+        return filepaths, filedata, mounts, keyring
+
+class TestAdminCommandIdempotency(CephFSTestCase):
+    """
+    Tests for administration command idempotency.
+    """
+
+    CLIENTS_REQUIRED = 0
+    MDSS_REQUIRED = 1
+
+    def test_rm_idempotency(self):
+        """
+        That a removing a fs twice is idempotent.
+        """
+
+        data_pools = self.fs.get_data_pool_names(refresh=True)
+        self.fs.fail()
+        self.fs.rm()
+        try:
+            self.fs.get_mds_map()
+        except FSMissing:
+            pass
+        else:
+            self.fail("get_mds_map should raise")
+        p = self.fs.rm()
+        self.assertIn("does not exist", p.stderr.getvalue())
+        self.fs.remove_pools(data_pools)
+
+
+class TestAdminCommandDumpTree(CephFSTestCase):
+    """
+    Tests for administration command subtrees.
+    """
+
+    CLIENTS_REQUIRED = 0
+    MDSS_REQUIRED = 1
+
+    def test_dump_subtrees(self):
+        """
+        Dump all the subtrees to make sure the MDS daemon won't crash.
+        """
+
+        subtrees = self.fs.mds_asok(['get', 'subtrees'])
+        log.info(f"dumping {len(subtrees)} subtrees:")
+        for subtree in subtrees:
+            log.info(f"  subtree: '{subtree['dir']['path']}'")
+            self.fs.mds_asok(['dump', 'tree', subtree['dir']['path']])
+
+        log.info("dumping 2 special subtrees:")
+        log.info("  subtree: '/'")
+        self.fs.mds_asok(['dump', 'tree', '/'])
+        log.info("  subtree: '~mdsdir'")
+        self.fs.mds_asok(['dump', 'tree', '~mdsdir'])
diff --git a/qa/tasks/cephfs/test_auto_repair.py b/qa/tasks/cephfs/test_auto_repair.py
new file mode 100644
index 000000000..00c86b68b
--- /dev/null
+++ b/qa/tasks/cephfs/test_auto_repair.py
@@ -0,0 +1,88 @@
+
+"""
+Exercise the MDS's auto repair functions
+"""
+
+import logging
+import time
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+
+class TestMDSAutoRepair(CephFSTestCase):
+    def test_backtrace_repair(self):
+        """
+        MDS should verify/fix backtrace on fetch dirfrag
+        """
+
+        self.mount_a.run_shell(["mkdir", "testdir1"])
+        self.mount_a.run_shell(["touch", "testdir1/testfile"])
+        dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino("testdir1"))
+
+        # drop inodes caps
+        self.mount_a.umount_wait()
+
+        # flush journal entries to dirfrag objects, and expire journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # Restart the MDS to drop the metadata cache (because we expired the journal,
+        # nothing gets replayed into cache on restart)
+        self.fs.rank_fail()
+        self.fs.wait_for_daemons()
+
+        # remove testdir1's backtrace
+        self.fs.radosm(["rmxattr", dir_objname, "parent"])
+
+        # readdir (fetch dirfrag) should fix testdir1's backtrace
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell(["ls", "testdir1"])
+
+        # flush journal entries to dirfrag objects
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # check if backtrace exists
+        self.fs.radosm(["getxattr", dir_objname, "parent"])
+
+    def test_mds_readonly(self):
+        """
+        test if MDS behave correct when it's readonly
+        """
+        # operation should successd when MDS is not readonly
+        self.mount_a.run_shell(["touch", "test_file1"])
+        writer = self.mount_a.write_background(loop=True)
+
+        time.sleep(10)
+        self.assertFalse(writer.finished)
+
+        # force MDS to read-only mode
+        self.fs.mds_asok(['force_readonly'])
+        time.sleep(10)
+
+        # touching test file should fail
+        try:
+            self.mount_a.run_shell(["touch", "test_file1"])
+        except CommandFailedError:
+            pass
+        else:
+            self.assertTrue(False)
+
+        # background writer also should fail
+        self.assertTrue(writer.finished)
+
+        # The MDS should report its readonly health state to the mon
+        self.wait_for_health("MDS_READ_ONLY", timeout=30)
+
+        # restart mds to make it writable
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.wait_for_health_clear(timeout=30)
diff --git a/qa/tasks/cephfs/test_backtrace.py b/qa/tasks/cephfs/test_backtrace.py
new file mode 100644
index 000000000..af246a1e3
--- /dev/null
+++ b/qa/tasks/cephfs/test_backtrace.py
@@ -0,0 +1,78 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+class TestBacktrace(CephFSTestCase):
+    def test_backtrace(self):
+        """
+        That the 'parent' and 'layout' xattrs on the head objects of files
+        are updated correctly.
+        """
+
+        old_data_pool_name = self.fs.get_data_pool_name()
+        old_pool_id = self.fs.get_data_pool_id()
+
+        # Create a file for subsequent checks
+        self.mount_a.run_shell(["mkdir", "parent_a"])
+        self.mount_a.run_shell(["touch", "parent_a/alpha"])
+        file_ino = self.mount_a.path_to_ino("parent_a/alpha")
+
+        # That backtrace and layout are written after initial flush
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_a'], [a['dname'] for a in backtrace['ancestors']])
+        layout = self.fs.read_layout(file_ino)
+        self.assertDictEqual(layout, {
+            "stripe_unit": 4194304,
+            "stripe_count": 1,
+            "object_size": 4194304,
+            "pool_id": old_pool_id,
+            "pool_ns": "",
+        })
+        self.assertEqual(backtrace['pool'], old_pool_id)
+
+        # That backtrace is written after parentage changes
+        self.mount_a.run_shell(["mkdir", "parent_b"])
+        self.mount_a.run_shell(["mv", "parent_a/alpha", "parent_b/alpha"])
+
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace['ancestors']])
+
+        # Create a new data pool
+        new_pool_name = "data_new"
+        new_pool_id = self.fs.add_data_pool(new_pool_name)
+
+        # That an object which has switched pools gets its backtrace updated
+        self.mount_a.setfattr("./parent_b/alpha",
+                              "ceph.file.layout.pool", new_pool_name)
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
+        self.assertEqual(backtrace_old_pool['pool'], new_pool_id)
+        backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
+        self.assertEqual(backtrace_new_pool['pool'], new_pool_id)
+        new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name)
+        self.assertEqual(new_pool_layout['pool_id'], new_pool_id)
+        self.assertEqual(new_pool_layout['pool_ns'], '')
+
+        # That subsequent linkage changes are only written to new pool backtrace
+        self.mount_a.run_shell(["mkdir", "parent_c"])
+        self.mount_a.run_shell(["mv", "parent_b/alpha", "parent_c/alpha"])
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
+        self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace_old_pool['ancestors']])
+        backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
+        self.assertEqual(['alpha', 'parent_c'], [a['dname'] for a in backtrace_new_pool['ancestors']])
+
+        # That layout is written to new pool after change to other field in layout
+        self.mount_a.setfattr("./parent_c/alpha",
+                              "ceph.file.layout.object_size", "8388608")
+
+        self.fs.mds_asok(["flush", "journal"])
+        new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name)
+        self.assertEqual(new_pool_layout['object_size'], 8388608)
+
+        # ...but not to the old pool: the old pool's backtrace points to the new pool, and that's enough,
+        # we don't update the layout in all the old pools whenever it changes
+        old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name)
+        self.assertEqual(old_pool_layout['object_size'], 4194304)
diff --git a/qa/tasks/cephfs/test_cap_flush.py b/qa/tasks/cephfs/test_cap_flush.py
new file mode 100644
index 000000000..c472e85bd
--- /dev/null
+++ b/qa/tasks/cephfs/test_cap_flush.py
@@ -0,0 +1,58 @@
+
+import os
+import time
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+class TestCapFlush(CephFSTestCase):
+    @for_teuthology
+    def test_replay_create(self):
+        """
+        MDS starts to handle client caps when it enters clientreplay stage.
+        When handling a client cap in clientreplay stage, it's possible that
+        corresponding inode does not exist because the client request which
+        creates inode hasn't been replayed.
+        """
+
+        dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+        py_script = dedent("""
+            import os
+            os.mkdir("{0}")
+            fd = os.open("{0}", os.O_RDONLY)
+            os.fchmod(fd, 0o777)
+            os.fsync(fd)
+            """).format(dir_path)
+        self.mount_a.run_python(py_script)
+
+        self.fs.mds_asok(["flush", "journal"])
+
+        # client will only get unsafe replay
+        self.fs.mds_asok(["config", "set", "mds_log_pause", "1"])
+
+        file_name = "testfile"
+        file_path = dir_path + "/" + file_name
+
+        # Create a file and modify its mode. ceph-fuse will mark Ax cap dirty
+        py_script = dedent("""
+            import os
+            os.chdir("{0}")
+            os.setgid(65534)
+            os.setuid(65534)
+            fd = os.open("{1}", os.O_CREAT | os.O_RDWR, 0o644)
+            os.fchmod(fd, 0o640)
+            """).format(dir_path, file_name)
+        self.mount_a.run_python(py_script, sudo=True)
+
+        # Modify file mode by different user. ceph-fuse will send a setattr request
+        self.mount_a.run_shell(["chmod", "600", file_path], wait=False, sudo=True)
+
+        time.sleep(10)
+
+        # Restart mds. Client will re-send the unsafe request and cap flush
+        self.fs.rank_fail()
+        self.fs.wait_for_daemons()
+
+        mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip()
+        # If the cap flush get dropped, mode should be 0644.
+        # (Ax cap stays in dirty state, which prevents setattr reply from updating file mode)
+        self.assertEqual(mode, "600")
diff --git a/qa/tasks/cephfs/test_cephfs_shell.py b/qa/tasks/cephfs/test_cephfs_shell.py
new file mode 100644
index 000000000..8995d260f
--- /dev/null
+++ b/qa/tasks/cephfs/test_cephfs_shell.py
@@ -0,0 +1,1028 @@
+"""
+Before running this testsuite, add path to cephfs-shell module to $PATH and
+export $PATH.
+"""
+from io import StringIO
+from os import path
+import crypt
+import logging
+from tempfile import mkstemp as tempfile_mkstemp
+import math
+from time import sleep
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra.run import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+def humansize(nbytes):
+    suffixes = ['B', 'K', 'M', 'G', 'T', 'P']
+    i = 0
+    while nbytes >= 1024 and i < len(suffixes)-1:
+        nbytes /= 1024.
+        i += 1
+    nbytes = math.ceil(nbytes)
+    f = ('%d' % nbytes).rstrip('.')
+    return '%s%s' % (f, suffixes[i])
+
+def ensure_str(s):
+    if isinstance(s, str):
+        return s
+    if isinstance(s, bytes):
+        return s.decode()
+    raise TypeError("not expecting type '%s'" % type(s))
+    
+class TestCephFSShell(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+
+    def setUp(self):
+        super(TestCephFSShell, self).setUp()
+
+        conf_contents = "[cephfs-shell]\ncolors = False\ndebug = True\n"
+        confpath = self.mount_a.client_remote.sh('mktemp').strip()
+        self.mount_a.client_remote.write_file(confpath, conf_contents)
+        self.default_shell_conf_path = confpath
+
+    def run_cephfs_shell_cmd(self, cmd, mount_x=None, shell_conf_path=None,
+                             opts=None, stdout=None, stderr=None, stdin=None,
+                             check_status=True):
+        stdout = stdout or StringIO()
+        stderr = stderr or StringIO()
+        if mount_x is None:
+            mount_x = self.mount_a
+        if isinstance(cmd, list):
+            cmd = " ".join(cmd)
+        if not shell_conf_path:
+            shell_conf_path = self.default_shell_conf_path
+
+        args = ["cephfs-shell", "-c", shell_conf_path]
+        if opts:
+            args += opts
+        args.extend(("--", cmd))
+
+        log.info("Running command: {}".format(" ".join(args)))
+        return mount_x.client_remote.run(args=args, stdout=stdout,
+                                         stderr=stderr, stdin=stdin,
+                                         check_status=check_status)
+
+    def negtest_cephfs_shell_cmd(self, **kwargs):
+        """
+        This method verifies that cephfs shell command fails with expected
+        return value and/or error message.
+
+        kwargs is expected to hold the arguments same as
+        run_cephfs_shell_cmd() with the following exceptions -
+            * It should not contain check_status (since commands are expected
+              to fail, check_status is hardcoded to False).
+            * It is optional to set expected error message and return value to
+              dict members 'errmsg' and 'retval' respectively.
+
+        This method servers as shorthand for codeblocks like -
+
+        try:
+            proc = self.run_cephfs_shell_cmd(args=['some', 'cmd'],
+                                             check_status=False,
+                                             stdout=stdout)
+        except CommandFailedError as e:
+            self.assertNotIn('some error message',
+                              proc.stderr.getvalue.lower())
+
+
+        try:
+            proc = self.run_cephfs_shell_cmd(args=['some', 'cmd'],
+                                             check_status=False,
+                                             stdout=stdout)
+        except CommandFailedError as e:
+            self.assertNotEqual(1, proc.returncode)
+        """
+        retval = kwargs.pop('retval', None)
+        errmsg = kwargs.pop('errmsg', None)
+        kwargs['check_status'] = False
+
+        proc = self.run_cephfs_shell_cmd(**kwargs)
+        if retval:
+            self.assertEqual(proc.returncode, retval)
+        else:
+            self.assertNotEqual(proc.returncode, 0)
+        if errmsg:
+            self.assertIn(errmsg, proc.stderr.getvalue().lower())
+
+        return proc
+
+    def get_cephfs_shell_cmd_output(self, cmd, mount_x=None,
+                                    shell_conf_path=None, opts=None,
+                                    stdout=None, stdin=None,check_status=True):
+        return ensure_str(self.run_cephfs_shell_cmd(
+            cmd=cmd, mount_x=mount_x, shell_conf_path=shell_conf_path,
+            opts=opts, stdout=stdout, stdin=stdin,
+            check_status=check_status).stdout.getvalue().strip())
+
+    def get_cephfs_shell_cmd_error(self, cmd, mount_x=None,
+                                   shell_conf_path=None, opts=None,
+                                   stderr=None, stdin=None, check_status=True):
+        return ensure_str(self.run_cephfs_shell_cmd(
+            cmd=cmd, mount_x=mount_x, shell_conf_path=shell_conf_path,
+            opts=opts, stderr=stderr, stdin=stdin,
+            check_status=check_status).stderr.getvalue().strip())
+
+    def run_cephfs_shell_script(self, script, mount_x=None,
+                                shell_conf_path=None, opts=None, stdout=None,
+                                stderr=None, stdin=None, check_status=True):
+        stdout = stdout or StringIO()
+        stderr = stderr or StringIO()
+        if mount_x is None:
+            mount_x = self.mount_a
+
+        scriptpath = tempfile_mkstemp(prefix='test-cephfs', text=True)[1]
+        with open(scriptpath, 'w') as scriptfile:
+            scriptfile.write(script)
+        # copy script to the machine running cephfs-shell.
+        mount_x.client_remote.put_file(scriptpath, scriptpath)
+        mount_x.run_shell_payload(f"chmod 755 {scriptpath}")
+
+        args = ["cephfs-shell", '-b', scriptpath]
+        if shell_conf_path:
+            args[1:1] = ["-c", shell_conf_path]
+        log.info('Running script \"' + scriptpath + '\"')
+        return mount_x.client_remote.run(args=args, stdout=stdout,
+                                         stderr=stderr, stdin=stdin,
+                                         check_status=True)
+
+    def get_cephfs_shell_script_output(self, script, mount_x=None,
+                                       shell_conf_path=None, opts=None,
+                                       stdout=None, stdin=None,
+                                       check_status=True):
+        return ensure_str(self.run_cephfs_shell_script(
+            script=script, mount_x=mount_x, shell_conf_path=shell_conf_path,
+            opts=opts, stdout=stdout, stdin=stdin,
+            check_status=check_status).stdout.getvalue().strip())
+
+
+class TestMkdir(TestCephFSShell):
+    def test_mkdir(self):
+        """
+        Test that mkdir creates directory
+        """
+        o = self.get_cephfs_shell_cmd_output("mkdir d1")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        o = self.mount_a.stat('d1')
+        log.info("mount_a output:\n{}".format(o))
+
+    def test_mkdir_with_07000_octal_mode(self):
+        """
+        Test that mkdir fails with octal mode greater than 0777
+        """
+        self.negtest_cephfs_shell_cmd(cmd="mkdir -m 07000 d2")
+        try:
+            self.mount_a.stat('d2')
+        except CommandFailedError:
+            pass
+
+    def test_mkdir_with_negative_octal_mode(self):
+        """
+        Test that mkdir fails with negative octal mode
+        """
+        self.negtest_cephfs_shell_cmd(cmd="mkdir -m -0755 d3")
+        try:
+            self.mount_a.stat('d3')
+        except CommandFailedError:
+            pass
+
+    def test_mkdir_with_non_octal_mode(self):
+        """
+        Test that mkdir passes with non-octal mode
+        """
+        o = self.get_cephfs_shell_cmd_output("mkdir -m u=rwx d4")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # mkdir d4 should pass
+        o = self.mount_a.stat('d4')
+        assert((o['st_mode'] & 0o700) == 0o700)
+
+    def test_mkdir_with_bad_non_octal_mode(self):
+        """
+        Test that mkdir failes with bad non-octal mode
+        """
+        self.negtest_cephfs_shell_cmd(cmd="mkdir -m ugx=0755 d5")
+        try:
+            self.mount_a.stat('d5')
+        except CommandFailedError:
+            pass
+
+    def test_mkdir_path_without_path_option(self):
+        """
+        Test that mkdir fails without path option for creating path
+        """
+        self.negtest_cephfs_shell_cmd(cmd="mkdir d5/d6/d7")
+        try:
+            self.mount_a.stat('d5/d6/d7')
+        except CommandFailedError:
+            pass
+
+    def test_mkdir_path_with_path_option(self):
+        """
+        Test that mkdir passes with path option for creating path
+        """
+        o = self.get_cephfs_shell_cmd_output("mkdir -p d5/d6/d7")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # mkdir d5/d6/d7 should pass
+        o = self.mount_a.stat('d5/d6/d7')
+        log.info("mount_a output:\n{}".format(o))
+
+class TestRmdir(TestCephFSShell):
+    dir_name = "test_dir"
+
+    def dir_does_not_exists(self):
+        """
+        Tests that directory does not exists
+        """
+        try:
+            self.mount_a.stat(self.dir_name)
+        except CommandFailedError as e:
+            if  e.exitstatus == 2:
+                return 0
+            raise
+
+    def test_rmdir(self):
+        """
+        Test that rmdir deletes directory
+        """
+        self.run_cephfs_shell_cmd("mkdir " + self.dir_name)
+        self.run_cephfs_shell_cmd("rmdir "+ self.dir_name)
+        self.dir_does_not_exists()
+
+    def test_rmdir_non_existing_dir(self):
+        """
+        Test that rmdir does not delete a non existing directory
+        """
+        self.negtest_cephfs_shell_cmd(cmd="rmdir test_dir")
+        self.dir_does_not_exists()
+
+    def test_rmdir_dir_with_file(self):
+        """
+        Test that rmdir does not delete directory containing file
+        """
+        self.run_cephfs_shell_cmd("mkdir " + self.dir_name)
+        self.run_cephfs_shell_cmd("put - test_dir/dumpfile", stdin="Valid File")
+        self.run_cephfs_shell_cmd("rmdir" + self.dir_name)
+        self.mount_a.stat(self.dir_name)
+
+    def test_rmdir_existing_file(self):
+        """
+        Test that rmdir does not delete a file
+        """
+        self.run_cephfs_shell_cmd("put - dumpfile", stdin="Valid File")
+        self.negtest_cephfs_shell_cmd(cmd="rmdir dumpfile")
+        self.mount_a.stat("dumpfile")
+
+    def test_rmdir_p(self):
+        """
+        Test that rmdir -p deletes all empty directories in the root directory passed
+        """
+        self.run_cephfs_shell_cmd("mkdir -p test_dir/t1/t2/t3")
+        self.run_cephfs_shell_cmd("rmdir -p "+ self.dir_name)
+        self.dir_does_not_exists()
+
+    def test_rmdir_p_valid_path(self):
+        """
+        Test that rmdir -p deletes all empty directories in the path passed
+        """
+        self.run_cephfs_shell_cmd("mkdir -p test_dir/t1/t2/t3")
+        self.run_cephfs_shell_cmd("rmdir -p test_dir/t1/t2/t3")
+        self.dir_does_not_exists()
+
+    def test_rmdir_p_non_existing_dir(self):
+        """
+        Test that rmdir -p does not delete an invalid directory
+        """
+        self.negtest_cephfs_shell_cmd(cmd="rmdir -p test_dir")
+        self.dir_does_not_exists()
+
+    def test_rmdir_p_dir_with_file(self):
+        """
+        Test that rmdir -p does not delete the directory containing a file
+        """
+        self.run_cephfs_shell_cmd("mkdir " + self.dir_name)
+        self.run_cephfs_shell_cmd("put - test_dir/dumpfile", stdin="Valid File")
+        self.run_cephfs_shell_cmd("rmdir -p " + self.dir_name)
+        self.mount_a.stat(self.dir_name)
+
+class TestGetAndPut(TestCephFSShell):
+    def test_get_with_target_name(self):
+        """
+        Test that get passes with target name
+        """
+        s = 'C' * 1024
+        s_hash = crypt.crypt(s, '.A')
+        o = self.get_cephfs_shell_cmd_output("put - dump4", stdin=s)
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # put - dump4 should pass
+        o = self.mount_a.stat('dump4')
+        log.info("mount_a output:\n{}".format(o))
+
+        o = self.get_cephfs_shell_cmd_output("get dump4 ./dump4")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        o = self.get_cephfs_shell_cmd_output("!cat dump4")
+        o_hash = crypt.crypt(o, '.A')
+
+        # s_hash must be equal to o_hash
+        log.info("s_hash:{}".format(s_hash))
+        log.info("o_hash:{}".format(o_hash))
+        assert(s_hash == o_hash)
+
+    def test_get_without_target_name(self):
+        """
+        Test that get should fail when there is no target name
+        """
+        s = 'Somedata'
+        # put - dump5 should pass
+        self.get_cephfs_shell_cmd_output("put - dump5", stdin=s)
+
+        self.mount_a.stat('dump5')
+
+        # get dump5 should fail as there is no local_path mentioned
+        with self.assertRaises(CommandFailedError):
+            self.get_cephfs_shell_cmd_output("get dump5")
+
+        # stat dump would return non-zero exit code as get dump failed
+        # cwd=None because we want to run it at CWD, not at cephfs mntpt.
+        r = self.mount_a.run_shell('stat dump5', cwd=None,
+                                   check_status=False).returncode
+        self.assertEqual(r, 1)
+
+    def test_get_doesnt_create_dir(self):
+        # if get cmd is creating subdirs on its own then dump7 will be
+        # stored as ./dump7/tmp/dump7 and not ./dump7, therefore
+        # if doing `cat ./dump7` returns non-zero exit code(i.e. 1) then
+        # it implies that no such file exists at that location
+        dir_abspath = path.join(self.mount_a.mountpoint, 'tmp')
+        self.mount_a.run_shell_payload(f"mkdir {dir_abspath}")
+        self.mount_a.client_remote.write_file(path.join(dir_abspath, 'dump7'),
+                                              'somedata')
+        self.get_cephfs_shell_cmd_output("get /tmp/dump7 ./dump7")
+        # test that dump7 exists
+        self.mount_a.run_shell("cat ./dump7", cwd=None)
+
+    def test_get_to_console(self):
+        """
+        Test that get passes with target name
+        """
+        s = 'E' * 1024
+        s_hash = crypt.crypt(s, '.A')
+        o = self.get_cephfs_shell_cmd_output("put - dump6", stdin=s)
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # put - dump6 should pass
+        o = self.mount_a.stat('dump6')
+        log.info("mount_a output:\n{}".format(o))
+
+        # get dump6 - should pass
+        o = self.get_cephfs_shell_cmd_output("get dump6 -")
+        o_hash = crypt.crypt(o, '.A')
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # s_hash must be equal to o_hash
+        log.info("s_hash:{}".format(s_hash))
+        log.info("o_hash:{}".format(o_hash))
+        assert(s_hash == o_hash)
+
+    def test_put_without_target_name(self):
+        """
+        put - should fail as the cmd expects both arguments are mandatory.
+        """
+        with self.assertRaises(CommandFailedError):
+            self.get_cephfs_shell_cmd_output("put -")
+
+    def test_put_validate_local_path(self):
+        """
+        This test is intended to make sure local_path is validated before
+        trying to put the file from local fs to cephfs and the command
+        put ./dumpXYZ dump8 would fail as dumpXYX doesn't exist.
+        """
+        with self.assertRaises(CommandFailedError):
+            o = self.get_cephfs_shell_cmd_output("put ./dumpXYZ dump8")
+            log.info("cephfs-shell output:\n{}".format(o))
+
+class TestSnapshots(TestCephFSShell):
+    def test_snap(self):
+        """
+        Test that snapshot creation and deletion work
+        """
+        sd = self.fs.get_config('client_snapdir')
+        sdn = "data_dir/{}/snap1".format(sd)
+
+        # create a data dir and dump some files into it
+        self.get_cephfs_shell_cmd_output("mkdir data_dir")
+        s = 'A' * 10240
+        o = self.get_cephfs_shell_cmd_output("put - data_dir/data_a", stdin=s)
+        s = 'B' * 10240
+        o = self.get_cephfs_shell_cmd_output("put - data_dir/data_b", stdin=s)
+        s = 'C' * 10240
+        o = self.get_cephfs_shell_cmd_output("put - data_dir/data_c", stdin=s)
+        s = 'D' * 10240
+        o = self.get_cephfs_shell_cmd_output("put - data_dir/data_d", stdin=s)
+        s = 'E' * 10240
+        o = self.get_cephfs_shell_cmd_output("put - data_dir/data_e", stdin=s)
+
+        o = self.get_cephfs_shell_cmd_output("ls -l /data_dir")
+        log.info("cephfs-shell output:\n{}".format(o))
+
+        # create the snapshot - must pass
+        o = self.get_cephfs_shell_cmd_output("snap create snap1 /data_dir")
+        log.info("cephfs-shell output:\n{}".format(o))
+        self.assertEqual("", o)
+        o = self.mount_a.stat(sdn)
+        log.info("mount_a output:\n{}".format(o))
+        self.assertIn('st_mode', o)
+
+        # create the same snapshot again - must fail with an error message
+        self.negtest_cephfs_shell_cmd(cmd="snap create snap1 /data_dir",
+                                      errmsg="snapshot 'snap1' already exists")
+        o = self.mount_a.stat(sdn)
+        log.info("mount_a output:\n{}".format(o))
+        self.assertIn('st_mode', o)
+
+        # delete the snapshot - must pass
+        o = self.get_cephfs_shell_cmd_output("snap delete snap1 /data_dir")
+        log.info("cephfs-shell output:\n{}".format(o))
+        self.assertEqual("", o)
+        try:
+            o = self.mount_a.stat(sdn)
+        except CommandFailedError:
+            # snap dir should not exist anymore
+            pass
+        log.info("mount_a output:\n{}".format(o))
+        self.assertNotIn('st_mode', o)
+
+        # delete the same snapshot again - must fail with an error message
+        self.negtest_cephfs_shell_cmd(cmd="snap delete snap1 /data_dir",
+                                      errmsg="'snap1': no such snapshot")
+        try:
+            o = self.mount_a.stat(sdn)
+        except CommandFailedError:
+            pass
+        log.info("mount_a output:\n{}".format(o))
+        self.assertNotIn('st_mode', o)
+
+class TestCD(TestCephFSShell):
+    CLIENTS_REQUIRED = 1
+
+    def test_cd_with_no_args(self):
+        """
+        Test that when cd is issued without any arguments, CWD is changed
+        to root directory.
+        """
+        path = 'dir1/dir2/dir3'
+        self.mount_a.run_shell_payload(f"mkdir -p {path}")
+        expected_cwd = '/'
+
+        script = 'cd {}\ncd\ncwd\n'.format(path)
+        output = self.get_cephfs_shell_script_output(script)
+        self.assertEqual(output, expected_cwd)
+
+    def test_cd_with_args(self):
+        """
+        Test that when cd is issued with an argument, CWD is changed
+        to the path passed in the argument.
+        """
+        path = 'dir1/dir2/dir3'
+        self.mount_a.run_shell_payload(f"mkdir -p {path}")
+        expected_cwd = '/dir1/dir2/dir3'
+
+        script = 'cd {}\ncwd\n'.format(path)
+        output = self.get_cephfs_shell_script_output(script)
+        self.assertEqual(output, expected_cwd)
+
+class TestDU(TestCephFSShell):
+    CLIENTS_REQUIRED = 1
+
+    def test_du_works_for_regfiles(self):
+        regfilename = 'some_regfile'
+        regfile_abspath = path.join(self.mount_a.mountpoint, regfilename)
+        self.mount_a.client_remote.write_file(regfile_abspath, 'somedata')
+
+        size = humansize(self.mount_a.stat(regfile_abspath)['st_size'])
+        expected_output = r'{}{}{}'.format(size, " +", regfilename)
+
+        du_output = self.get_cephfs_shell_cmd_output('du ' + regfilename)
+        self.assertRegex(du_output, expected_output)
+
+    def test_du_works_for_non_empty_dirs(self):
+        dirname = 'some_directory'
+        dir_abspath = path.join(self.mount_a.mountpoint, dirname)
+        regfilename = 'some_regfile'
+        regfile_abspath = path.join(dir_abspath, regfilename)
+        self.mount_a.run_shell_payload(f"mkdir {dir_abspath}")
+        self.mount_a.client_remote.write_file(regfile_abspath, 'somedata')
+
+        # XXX: we stat `regfile_abspath` here because ceph du reports a non-empty
+        # directory's size as sum of sizes of all files under it.
+        size = humansize(self.mount_a.stat(regfile_abspath)['st_size'])
+        expected_output = r'{}{}{}'.format(size, " +", dirname)
+
+        sleep(10)
+        du_output = self.get_cephfs_shell_cmd_output('du ' + dirname)
+        self.assertRegex(du_output, expected_output)
+
+    def test_du_works_for_empty_dirs(self):
+        dirname = 'some_directory'
+        dir_abspath = path.join(self.mount_a.mountpoint, dirname)
+        self.mount_a.run_shell_payload(f"mkdir {dir_abspath}")
+
+        size = humansize(self.mount_a.stat(dir_abspath)['st_size'])
+        expected_output = r'{}{}{}'.format(size, " +", dirname)
+
+        du_output = self.get_cephfs_shell_cmd_output('du ' + dirname)
+        self.assertRegex(du_output, expected_output)
+
+    def test_du_works_for_hardlinks(self):
+        regfilename = 'some_regfile'
+        regfile_abspath = path.join(self.mount_a.mountpoint, regfilename)
+        self.mount_a.client_remote.write_file(regfile_abspath, 'somedata')
+        hlinkname = 'some_hardlink'
+        hlink_abspath = path.join(self.mount_a.mountpoint, hlinkname)
+        self.mount_a.run_shell_payload(f"ln {regfile_abspath} {hlink_abspath}")
+
+        size = humansize(self.mount_a.stat(hlink_abspath)['st_size'])
+        expected_output = r'{}{}{}'.format(size, " +", hlinkname)
+
+        du_output = self.get_cephfs_shell_cmd_output('du ' + hlinkname)
+        self.assertRegex(du_output, expected_output)
+
+    def test_du_works_for_softlinks_to_files(self):
+        regfilename = 'some_regfile'
+        regfile_abspath = path.join(self.mount_a.mountpoint, regfilename)
+        self.mount_a.client_remote.write_file(regfile_abspath, 'somedata')
+        slinkname = 'some_softlink'
+        slink_abspath = path.join(self.mount_a.mountpoint, slinkname)
+        self.mount_a.run_shell_payload(f"ln -s {regfile_abspath} {slink_abspath}")
+
+        size = humansize(self.mount_a.lstat(slink_abspath)['st_size'])
+        expected_output = r'{}{}{}'.format((size), " +", slinkname)
+
+        du_output = self.get_cephfs_shell_cmd_output('du ' + slinkname)
+        self.assertRegex(du_output, expected_output)
+
+    def test_du_works_for_softlinks_to_dirs(self):
+        dirname = 'some_directory'
+        dir_abspath = path.join(self.mount_a.mountpoint, dirname)
+        self.mount_a.run_shell_payload(f"mkdir {dir_abspath}")
+        slinkname = 'some_softlink'
+        slink_abspath = path.join(self.mount_a.mountpoint, slinkname)
+        self.mount_a.run_shell_payload(f"ln -s {dir_abspath} {slink_abspath}")
+
+        size = humansize(self.mount_a.lstat(slink_abspath)['st_size'])
+        expected_output = r'{}{}{}'.format(size, " +", slinkname)
+
+        du_output = self.get_cephfs_shell_cmd_output('du ' + slinkname)
+        self.assertRegex(du_output, expected_output)
+
+    # NOTE: tests using these are pretty slow since to this methods sleeps for
+    # 15 seconds
+    def _setup_files(self, return_path_to_files=False, path_prefix='./'):
+        dirname = 'dir1'
+        regfilename = 'regfile'
+        hlinkname = 'hlink'
+        slinkname = 'slink1'
+        slink2name = 'slink2'
+
+        dir_abspath = path.join(self.mount_a.mountpoint, dirname)
+        regfile_abspath = path.join(self.mount_a.mountpoint, regfilename)
+        hlink_abspath = path.join(self.mount_a.mountpoint, hlinkname)
+        slink_abspath = path.join(self.mount_a.mountpoint, slinkname)
+        slink2_abspath = path.join(self.mount_a.mountpoint, slink2name)
+
+        self.mount_a.run_shell_payload(f"mkdir {dir_abspath}")
+        self.mount_a.run_shell_payload(f"touch {regfile_abspath}")
+        self.mount_a.run_shell_payload(f"ln {regfile_abspath} {hlink_abspath}")
+        self.mount_a.run_shell_payload(f"ln -s {regfile_abspath} {slink_abspath}")
+        self.mount_a.run_shell_payload(f"ln -s {dir_abspath} {slink2_abspath}")
+
+        dir2_name = 'dir2'
+        dir21_name = 'dir21'
+        regfile121_name = 'regfile121'
+        dir2_abspath = path.join(self.mount_a.mountpoint, dir2_name)
+        dir21_abspath = path.join(dir2_abspath, dir21_name)
+        regfile121_abspath = path.join(dir21_abspath, regfile121_name)
+        self.mount_a.run_shell_payload(f"mkdir -p {dir21_abspath}")
+        self.mount_a.run_shell_payload(f"touch {regfile121_abspath}")
+
+        self.mount_a.client_remote.write_file(regfile_abspath, 'somedata')
+        self.mount_a.client_remote.write_file(regfile121_abspath, 'somemoredata')
+
+        # TODO: is there a way to trigger/force update ceph.dir.rbytes?
+        # wait so that attr ceph.dir.rbytes gets a chance to be updated.
+        sleep(20)
+
+        expected_patterns = []
+        path_to_files = []
+
+        def append_expected_output_pattern(f):
+            if f == '/':
+                expected_patterns.append(r'{}{}{}'.format(size, " +", '.' + f))
+            else:
+                expected_patterns.append(r'{}{}{}'.format(size, " +",
+                    path_prefix + path.relpath(f, self.mount_a.mountpoint)))
+
+        for f in [dir_abspath, regfile_abspath, regfile121_abspath,
+                  hlink_abspath, slink_abspath, slink2_abspath]:
+            size = humansize(self.mount_a.stat(f, follow_symlinks=
+                                               False)['st_size'])
+            append_expected_output_pattern(f)
+
+        # get size for directories containig regfiles within
+        for f in [dir2_abspath, dir21_abspath]:
+            size = humansize(self.mount_a.stat(regfile121_abspath,
+                             follow_symlinks=False)['st_size'])
+            append_expected_output_pattern(f)
+
+        # get size for CephFS root
+        size = 0
+        for f in [regfile_abspath, regfile121_abspath, slink_abspath,
+                  slink2_abspath]:
+            size += self.mount_a.stat(f, follow_symlinks=False)['st_size']
+        size = humansize(size)
+        append_expected_output_pattern('/')
+
+        if return_path_to_files:
+            for p in [dir_abspath, regfile_abspath, dir2_abspath,
+                      dir21_abspath, regfile121_abspath, hlink_abspath,
+                      slink_abspath, slink2_abspath]:
+                 path_to_files.append(path.relpath(p, self.mount_a.mountpoint))
+
+            return (expected_patterns, path_to_files)
+        else:
+            return expected_patterns
+
+    def test_du_works_recursively_with_no_path_in_args(self):
+        expected_patterns_in_output = self._setup_files()
+        du_output = self.get_cephfs_shell_cmd_output('du -r')
+
+        for expected_output in expected_patterns_in_output:
+            self.assertRegex(du_output, expected_output)
+
+    def test_du_with_path_in_args(self):
+        expected_patterns_in_output, path_to_files = self._setup_files(True,
+            path_prefix='')
+
+        args = ['du', '/']
+        for p in path_to_files:
+            args.append(p)
+        du_output = self.get_cephfs_shell_cmd_output(args)
+
+        for expected_output in expected_patterns_in_output:
+            self.assertRegex(du_output, expected_output)
+
+    def test_du_with_no_args(self):
+        expected_patterns_in_output = self._setup_files()
+
+        du_output = self.get_cephfs_shell_cmd_output('du')
+
+        for expected_output in expected_patterns_in_output:
+            # Since CWD is CephFS root and being non-recursive expect only
+            # CWD in DU report.
+            if expected_output.find('/') == len(expected_output) - 1:
+                self.assertRegex(du_output, expected_output)
+
+
+class TestDF(TestCephFSShell):
+    def validate_df(self, filename):
+        df_output = self.get_cephfs_shell_cmd_output('df '+filename)
+        log.info("cephfs-shell df output:\n{}".format(df_output))
+
+        shell_df = df_output.splitlines()[1].split()
+
+        block_size = int(self.mount_a.df()["total"]) // 1024
+        log.info("cephfs df block size output:{}\n".format(block_size))
+
+        st_size = int(self.mount_a.stat(filename)["st_size"])
+        log.info("cephfs stat used output:{}".format(st_size))
+        log.info("cephfs available:{}\n".format(block_size - st_size))
+
+        self.assertTupleEqual((block_size, st_size, block_size - st_size),
+            (int(shell_df[0]), int(shell_df[1]) , int(shell_df[2])))
+
+    def test_df_with_no_args(self):
+        expected_output = ''
+        df_output = self.get_cephfs_shell_cmd_output('df')
+        assert df_output == expected_output
+
+    def test_df_for_valid_directory(self):
+        dir_name = 'dir1'
+        mount_output = self.mount_a.run_shell_payload(f"mkdir {dir_name}")
+        log.info("cephfs-shell mount output:\n{}".format(mount_output))
+        self.validate_df(dir_name)
+
+    def test_df_for_invalid_directory(self):
+        dir_abspath = path.join(self.mount_a.mountpoint, 'non-existent-dir')
+        self.negtest_cephfs_shell_cmd(cmd='df ' + dir_abspath,
+                                      errmsg='error in stat')
+
+    def test_df_for_valid_file(self):
+        s = 'df test' * 14145016
+        o = self.get_cephfs_shell_cmd_output("put - dumpfile", stdin=s)
+        log.info("cephfs-shell output:\n{}".format(o))
+        self.validate_df("dumpfile")
+
+
+class TestQuota(TestCephFSShell):
+    dir_name = 'testdir'
+
+    def create_dir(self):
+        mount_output = self.get_cephfs_shell_cmd_output('mkdir ' + self.dir_name)
+        log.info("cephfs-shell mount output:\n{}".format(mount_output))
+
+    def set_and_get_quota_vals(self, input_val, check_status=True):
+        self.run_cephfs_shell_cmd(['quota', 'set', '--max_bytes',
+                                   input_val[0], '--max_files', input_val[1],
+                                   self.dir_name], check_status=check_status)
+
+        quota_output = self.get_cephfs_shell_cmd_output(['quota', 'get', self.dir_name],
+                                                        check_status=check_status)
+
+        quota_output = quota_output.split()
+        return quota_output[1], quota_output[3]
+
+    def test_set(self):
+        self.create_dir()
+        set_values = ('6', '2')
+        self.assertTupleEqual(self.set_and_get_quota_vals(set_values), set_values)
+
+    def test_replace_values(self):
+        self.test_set()
+        set_values = ('20', '4')
+        self.assertTupleEqual(self.set_and_get_quota_vals(set_values), set_values)
+
+    def test_set_invalid_dir(self):
+        set_values = ('5', '5')
+        try:
+            self.assertTupleEqual(self.set_and_get_quota_vals(
+                                  set_values, False), set_values)
+            raise Exception("Something went wrong!! Values set for non existing directory")
+        except IndexError:
+            # Test should pass as values cannot be set for non existing directory
+            pass
+
+    def test_set_invalid_values(self):
+        self.create_dir()
+        set_values = ('-6', '-5')
+        try:
+            self.assertTupleEqual(self.set_and_get_quota_vals(set_values,
+                                  False), set_values)
+            raise Exception("Something went wrong!! Invalid values set")
+        except IndexError:
+            # Test should pass as invalid values cannot be set
+            pass
+
+    def test_exceed_file_limit(self):
+        self.test_set()
+        dir_abspath = path.join(self.mount_a.mountpoint, self.dir_name)
+        self.mount_a.run_shell_payload(f"touch {dir_abspath}/file1")
+        file2 = path.join(dir_abspath, "file2")
+        try:
+            self.mount_a.run_shell_payload(f"touch {file2}")
+            raise Exception("Something went wrong!! File creation should have failed")
+        except CommandFailedError:
+            # Test should pass as file quota set to 2
+            # Additional condition to confirm file creation failure
+            if not path.exists(file2):
+                return 0
+            raise
+
+    def test_exceed_write_limit(self):
+        self.test_set()
+        dir_abspath = path.join(self.mount_a.mountpoint, self.dir_name)
+        filename = 'test_file'
+        file_abspath = path.join(dir_abspath, filename)
+        try:
+            # Write should fail as bytes quota is set to 6
+            self.mount_a.client_remote.write_file(file_abspath, 'Disk raise Exception')
+            raise Exception("Write should have failed")
+        except CommandFailedError:
+            # Test should pass only when write command fails
+            path_exists = path.exists(file_abspath)
+            if not path_exists:
+                # Testing with teuthology: No file is created.
+                return 0
+            elif path_exists and not path.getsize(file_abspath):
+                # Testing on Fedora 30: When write fails, empty file gets created.
+                return 0
+            else:
+                raise
+
+
+class TestXattr(TestCephFSShell):
+    dir_name = 'testdir'
+
+    def create_dir(self):
+        self.run_cephfs_shell_cmd('mkdir ' + self.dir_name)
+
+    def set_get_list_xattr_vals(self, input_val, negtest=False):
+        setxattr_output = self.get_cephfs_shell_cmd_output(
+            ['setxattr', self.dir_name, input_val[0], input_val[1]])
+        log.info("cephfs-shell setxattr output:\n{}".format(setxattr_output))
+
+        getxattr_output = self.get_cephfs_shell_cmd_output(
+            ['getxattr', self.dir_name, input_val[0]])
+        log.info("cephfs-shell getxattr output:\n{}".format(getxattr_output))
+
+        listxattr_output = self.get_cephfs_shell_cmd_output(
+            ['listxattr', self.dir_name])
+        log.info("cephfs-shell listxattr output:\n{}".format(listxattr_output))
+
+        return listxattr_output, getxattr_output
+
+    def test_set(self):
+        self.create_dir()
+        set_values = ('user.key', '2')
+        self.assertTupleEqual(self.set_get_list_xattr_vals(set_values), set_values)
+
+    def test_reset(self):
+        self.test_set()
+        set_values = ('user.key', '4')
+        self.assertTupleEqual(self.set_get_list_xattr_vals(set_values), set_values)
+
+    def test_non_existing_dir(self):
+        input_val = ('user.key', '9')
+        self.negtest_cephfs_shell_cmd(cmd=['setxattr', self.dir_name, input_val[0],
+                                       input_val[1]])
+        self.negtest_cephfs_shell_cmd(cmd=['getxattr', self.dir_name, input_val[0]])
+        self.negtest_cephfs_shell_cmd(cmd=['listxattr', self.dir_name])
+
+class TestLS(TestCephFSShell):
+    dir_name = ('test_dir')
+    hidden_dir_name = ('.test_hidden_dir')
+
+    def test_ls(self):
+        """ Test that ls prints files in CWD. """
+        self.run_cephfs_shell_cmd(f'mkdir {self.dir_name}')
+
+        ls_output = self.get_cephfs_shell_cmd_output("ls")
+        log.info(f"output of ls command:\n{ls_output}")
+
+        self.assertIn(self.dir_name, ls_output)
+
+    def test_ls_a(self):
+        """ Test ls -a prints hidden files in CWD."""
+
+        self.run_cephfs_shell_cmd(f'mkdir {self.hidden_dir_name}')
+
+        ls_a_output = self.get_cephfs_shell_cmd_output(['ls', '-a'])
+        log.info(f"output of ls -a command:\n{ls_a_output}")
+
+        self.assertIn(self.hidden_dir_name, ls_a_output)
+
+    def test_ls_does_not_print_hidden_dir(self):
+        """ Test ls command does not print hidden directory """
+
+        self.run_cephfs_shell_cmd(f'mkdir {self.hidden_dir_name}')
+
+        ls_output = self.get_cephfs_shell_cmd_output("ls")
+        log.info(f"output of ls command:\n{ls_output}")
+
+        self.assertNotIn(self.hidden_dir_name, ls_output)
+
+    def test_ls_a_prints_non_hidden_dir(self):
+        """ Test ls -a command prints non hidden directory """
+
+        self.run_cephfs_shell_cmd(f'mkdir {self.hidden_dir_name} {self.dir_name}')
+
+        ls_a_output = self.get_cephfs_shell_cmd_output(['ls', '-a'])
+        log.info(f"output of ls -a command:\n{ls_a_output}")
+
+        self.assertIn(self.dir_name, ls_a_output)
+
+    def test_ls_H_prints_human_readable_file_size(self):
+        """ Test "ls -lH" prints human readable file size."""
+
+        file_sizes = ['1','1K', '1M', '1G']
+        file_names = ['dump1', 'dump2', 'dump3', 'dump4']
+
+
+        for (file_size, file_name) in zip(file_sizes, file_names):
+            temp_file = self.mount_a.client_remote.mktemp(file_name)
+            self.mount_a.run_shell_payload(f"fallocate -l {file_size} {temp_file}")
+            self.mount_a.run_shell_payload(f'mv {temp_file} ./')
+
+        ls_H_output = self.get_cephfs_shell_cmd_output(['ls', '-lH'])
+
+        ls_H_file_size = set()
+        for line in ls_H_output.split('\n'):
+            ls_H_file_size.add(line.split()[1])
+
+        # test that file sizes are in human readable format
+        self.assertEqual({'1B','1K', '1M', '1G'}, ls_H_file_size)
+
+    def test_ls_s_sort_by_size(self):
+        """ Test "ls -S" sorts file listing by file_size """
+        test_file1 = "test_file1.txt"
+        test_file2 = "test_file2.txt"
+        file1_content = 'A' * 102
+        file2_content = 'B' * 10
+
+        self.run_cephfs_shell_cmd(f"write {test_file1}", stdin=file1_content)
+        self.run_cephfs_shell_cmd(f"write {test_file2}", stdin=file2_content)
+
+        ls_s_output = self.get_cephfs_shell_cmd_output(['ls', '-lS'])
+
+        file_sizes = []
+        for line in ls_s_output.split('\n'):
+            file_sizes.append(line.split()[1])
+
+        #test that file size are in ascending order
+        self.assertEqual(file_sizes, sorted(file_sizes))
+
+
+class TestMisc(TestCephFSShell):
+    def test_issue_cephfs_shell_cmd_at_invocation(self):
+        """
+        Test that `cephfs-shell -c conf cmd` works.
+        """
+        # choosing a long name since short ones have a higher probability
+        # of getting matched by coincidence.
+        dirname = 'somedirectory'
+        self.run_cephfs_shell_cmd(['mkdir', dirname])
+
+        output = self.mount_a.client_remote.sh(['cephfs-shell', 'ls']).\
+            strip()
+
+        self.assertRegex(output, dirname)
+
+    def test_help(self):
+        """
+        Test that help outputs commands.
+        """
+        o = self.get_cephfs_shell_cmd_output("help all")
+        log.info("output:\n{}".format(o))
+
+class TestShellOpts(TestCephFSShell):
+    """
+    Contains tests for shell options from conf file and shell prompt.
+    """
+
+    def setUp(self):
+        super(type(self), self).setUp()
+
+        # output of following command -
+        # editor - was: 'vim'
+        # now: '?'
+        # editor: '?'
+        self.editor_val = self.get_cephfs_shell_cmd_output(
+            'set editor ?, set editor').split('\n')[2]
+        self.editor_val = self.editor_val.split(':')[1].\
+            replace("'", "", 2).strip()
+
+    def write_tempconf(self, confcontents):
+        self.tempconfpath = self.mount_a.client_remote.mktemp(
+            suffix='cephfs-shell.conf')
+        self.mount_a.client_remote.write_file(self.tempconfpath,
+                         confcontents)
+
+    def test_reading_conf(self):
+        self.write_tempconf("[cephfs-shell]\neditor =  ???")
+
+        # output of following command -
+        # CephFS:~/>>> set editor
+        # editor: 'vim'
+        final_editor_val = self.get_cephfs_shell_cmd_output(
+            cmd='set editor', shell_conf_path=self.tempconfpath)
+        final_editor_val = final_editor_val.split(': ')[1]
+        final_editor_val = final_editor_val.replace("'", "", 2)
+
+        self.assertNotEqual(self.editor_val, final_editor_val)
+
+    def test_reading_conf_with_dup_opt(self):
+        """
+        Read conf without duplicate sections/options.
+        """
+        self.write_tempconf("[cephfs-shell]\neditor = ???\neditor = " +
+                            self.editor_val)
+
+        # output of following command -
+        # CephFS:~/>>> set editor
+        # editor: 'vim'
+        final_editor_val = self.get_cephfs_shell_cmd_output(
+            cmd='set editor', shell_conf_path=self.tempconfpath)
+        final_editor_val = final_editor_val.split(': ')[1]
+        final_editor_val = final_editor_val.replace("'", "", 2)
+
+        self.assertEqual(self.editor_val, final_editor_val)
+
+    def test_setting_opt_after_reading_conf(self):
+        self.write_tempconf("[cephfs-shell]\neditor = ???")
+
+        # output of following command -
+        # editor - was: vim
+        # now: vim
+        # editor: vim
+        final_editor_val = self.get_cephfs_shell_cmd_output(
+            cmd='set editor %s, set editor' % (self.editor_val),
+            shell_conf_path=self.tempconfpath)
+        final_editor_val = final_editor_val.split('\n')[2]
+        final_editor_val = final_editor_val.split(': ')[1]
+        final_editor_val = final_editor_val.replace("'", "", 2)
+
+        self.assertEqual(self.editor_val, final_editor_val)
diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py
new file mode 100644
index 000000000..4ea6576af
--- /dev/null
+++ b/qa/tasks/cephfs/test_client_limits.py
@@ -0,0 +1,327 @@
+
+"""
+Exercise the MDS's behaviour when clients and the MDCache reach or
+exceed the limits of how many caps/inodes they should hold.
+"""
+
+import logging
+from textwrap import dedent
+from tasks.ceph_test_case import TestTimeoutError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
+from tasks.cephfs.fuse_mount import FuseMount
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+# Hardcoded values from Server::recall_client_state
+CAP_RECALL_RATIO = 0.8
+CAP_RECALL_MIN = 100
+
+
+class TestClientLimits(CephFSTestCase):
+    REQUIRE_KCLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    def _test_client_pin(self, use_subdir, open_files):
+        """
+        When a client pins an inode in its cache, for example because the file is held open,
+        it should reject requests from the MDS to trim these caps.  The MDS should complain
+        to the user that it is unable to enforce its cache size limits because of this
+        objectionable client.
+
+        :param use_subdir: whether to put test files in a subdir or use root
+        """
+
+        # Set MDS cache memory limit to a low value that will make the MDS to
+        # ask the client to trim the caps.
+        cache_memory_limit = "1K"
+
+        self.config_set('mds', 'mds_cache_memory_limit', cache_memory_limit)
+        self.config_set('mds', 'mds_recall_max_caps', int(open_files/2))
+        self.config_set('mds', 'mds_recall_warning_threshold', open_files)
+
+        mds_min_caps_per_client = int(self.config_get('mds', "mds_min_caps_per_client"))
+        self.config_set('mds', 'mds_min_caps_working_set', mds_min_caps_per_client)
+        mds_max_caps_per_client = int(self.config_get('mds', "mds_max_caps_per_client"))
+        mds_recall_warning_decay_rate = float(self.config_get('mds', "mds_recall_warning_decay_rate"))
+        self.assertGreaterEqual(open_files, mds_min_caps_per_client)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        path = "subdir" if use_subdir else "."
+        open_proc = self.mount_a.open_n_background(path, open_files)
+
+        # Client should now hold:
+        # `open_files` caps for the open files
+        # 1 cap for root
+        # 1 cap for subdir
+        self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'],
+                              open_files + (2 if use_subdir else 1),
+                              timeout=600,
+                              reject_fn=lambda x: x > open_files + 2)
+
+        # MDS should not be happy about that, as the client is failing to comply
+        # with the SESSION_RECALL messages it is being sent
+        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
+
+        # We can also test that the MDS health warning for oversized
+        # cache is functioning as intended.
+        self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
+
+        # When the client closes the files, it should retain only as many caps as allowed
+        # under the SESSION_RECALL policy
+        log.info("Terminating process holding files open")
+        self.mount_a._kill_background(open_proc)
+
+        # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
+        # which depend on the caps outstanding, cache size and overall ratio
+        def expected_caps():
+            num_caps = self.get_session(mount_a_client_id)['num_caps']
+            if num_caps <= mds_min_caps_per_client:
+                return True
+            elif num_caps <= mds_max_caps_per_client:
+                return True
+            else:
+                return False
+
+        self.wait_until_true(expected_caps, timeout=60)
+
+    @needs_trimming
+    def test_client_pin_root(self):
+        self._test_client_pin(False, 400)
+
+    @needs_trimming
+    def test_client_pin(self):
+        self._test_client_pin(True, 800)
+
+    @needs_trimming
+    def test_client_pin_mincaps(self):
+        self._test_client_pin(True, 200)
+
+    def test_client_min_caps_working_set(self):
+        """
+        When a client has inodes pinned in its cache (open files), that the MDS
+        will not warn about the client not responding to cache pressure when
+        the number of caps is below mds_min_caps_working_set.
+        """
+
+        # Set MDS cache memory limit to a low value that will make the MDS to
+        # ask the client to trim the caps.
+        cache_memory_limit = "1K"
+        open_files = 400
+
+        self.config_set('mds', 'mds_cache_memory_limit', cache_memory_limit)
+        self.config_set('mds', 'mds_recall_max_caps', int(open_files/2))
+        self.config_set('mds', 'mds_recall_warning_threshold', open_files)
+        self.config_set('mds', 'mds_min_caps_working_set', open_files*2)
+
+        mds_min_caps_per_client = int(self.config_get('mds', "mds_min_caps_per_client"))
+        mds_recall_warning_decay_rate = float(self.config_get('mds', "mds_recall_warning_decay_rate"))
+        self.assertGreaterEqual(open_files, mds_min_caps_per_client)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.mount_a.open_n_background("subdir", open_files)
+
+        # Client should now hold:
+        # `open_files` caps for the open files
+        # 1 cap for root
+        # 1 cap for subdir
+        self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'],
+                              open_files + 2,
+                              timeout=600,
+                              reject_fn=lambda x: x > open_files + 2)
+
+        # We can also test that the MDS health warning for oversized
+        # cache is functioning as intended.
+        self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
+
+        try:
+            # MDS should not be happy about that but it's not sending
+            # MDS_CLIENT_RECALL warnings because the client's caps are below
+            # mds_min_caps_working_set.
+            self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
+        except TestTimeoutError:
+            pass
+        else:
+            raise RuntimeError("expected no client recall warning")
+
+    def test_cap_acquisition_throttle_readdir(self):
+        """
+        Mostly readdir acquires caps faster than the mds recalls, so the cap
+        acquisition via readdir is throttled by retrying the readdir after
+        a fraction of second (0.5) by default when throttling condition is met.
+        """
+
+        max_caps_per_client = 500
+        cap_acquisition_throttle = 250
+
+        self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client)
+        self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle)
+
+        # Create 1500 files split across 6 directories, 250 each.
+        for i in range(1, 7):
+            self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # recursive readdir
+        self.mount_a.run_shell_payload("find | wc")
+
+        # validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250
+        cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
+        self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle)
+
+        # validate the throttle condition to be hit atleast once
+        cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle']
+        self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1)
+
+    def test_client_release_bug(self):
+        """
+        When a client has a bug (which we will simulate) preventing it from releasing caps,
+        the MDS should notice that releases are not being sent promptly, and generate a health
+        metric to that effect.
+        """
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to inject client release failure")
+
+        self.set_conf('client.{0}'.format(self.mount_a.client_id), 'client inject release failure', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount_wait()
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Client A creates a file.  He will hold the write caps on the file, and later (simulated bug) fail
+        # to comply with the MDSs request to release that cap
+        self.mount_a.run_shell(["touch", "file1"])
+
+        # Client B tries to stat the file that client A created
+        rproc = self.mount_b.write_background("file1")
+
+        # After session_timeout, we should see a health warning (extra lag from
+        # MDS beacon period)
+        session_timeout = self.fs.get_var("session_timeout")
+        self.wait_for_health("MDS_CLIENT_LATE_RELEASE", session_timeout + 10)
+
+        # Client B should still be stuck
+        self.assertFalse(rproc.finished)
+
+        # Kill client A
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        # Client B should complete
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+        rproc.wait()
+
+    def test_client_oldest_tid(self):
+        """
+        When a client does not advance its oldest tid, the MDS should notice that
+        and generate health warnings.
+        """
+
+        # num of requests client issues
+        max_requests = 1000
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to inject client release failure")
+
+        self.set_conf('client', 'client inject fixed oldest tid', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount_wait()
+
+        self.fs.mds_asok(['config', 'set', 'mds_max_completed_requests', '{0}'.format(max_requests)])
+
+        # Create lots of files
+        self.mount_a.create_n_files("testdir/file1", max_requests + 100)
+
+        # Create a few files synchronously. This makes sure previous requests are completed
+        self.mount_a.create_n_files("testdir/file2", 5, True)
+
+        # Wait for the health warnings. Assume mds can handle 10 request per second at least
+        self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10)
+
+    def _test_client_cache_size(self, mount_subdir):
+        """
+        check if client invalidate kernel dcache according to its cache size config
+        """
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to inject client release failure")
+
+        if mount_subdir:
+            # fuse assigns a fix inode number (1) to root inode. But in mounting into
+            # subdir case, the actual inode number of root is not 1. This mismatch
+            # confuses fuse_lowlevel_notify_inval_entry() when invalidating dentries
+            # in root directory.
+            self.mount_a.run_shell(["mkdir", "subdir"])
+            self.mount_a.umount_wait()
+            self.set_conf('client', 'client mountpoint', '/subdir')
+            self.mount_a.mount_wait()
+            root_ino = self.mount_a.path_to_ino(".")
+            self.assertEqual(root_ino, 1);
+
+        dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+
+        mkdir_script = dedent("""
+            import os
+            os.mkdir("{path}")
+            for n in range(0, {num_dirs}):
+                os.mkdir("{path}/dir{{0}}".format(n))
+            """)
+
+        num_dirs = 1000
+        self.mount_a.run_python(mkdir_script.format(path=dir_path, num_dirs=num_dirs))
+        self.mount_a.run_shell(["sync"])
+
+        dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
+        self.assertGreaterEqual(dentry_count, num_dirs)
+        self.assertGreaterEqual(dentry_pinned_count, num_dirs)
+
+        cache_size = num_dirs // 10
+        self.mount_a.set_cache_size(cache_size)
+
+        def trimmed():
+            dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
+            log.info("waiting, dentry_count, dentry_pinned_count: {0}, {1}".format(
+                dentry_count, dentry_pinned_count
+            ))
+            if dentry_count > cache_size or dentry_pinned_count > cache_size:
+                return False
+
+            return True
+
+        self.wait_until_true(trimmed, 30)
+
+    @needs_trimming
+    def test_client_cache_size(self):
+        self._test_client_cache_size(False)
+        self._test_client_cache_size(True)
+
+    def test_client_max_caps(self):
+        """
+        That the MDS will not let a client sit above mds_max_caps_per_client caps.
+        """
+
+        mds_min_caps_per_client = int(self.config_get('mds', "mds_min_caps_per_client"))
+        mds_max_caps_per_client = 2*mds_min_caps_per_client
+        self.config_set('mds', 'mds_max_caps_per_client', mds_max_caps_per_client)
+
+        self.mount_a.create_n_files("foo/", 3*mds_max_caps_per_client, sync=True)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        def expected_caps():
+            num_caps = self.get_session(mount_a_client_id)['num_caps']
+            if num_caps <= mds_max_caps_per_client:
+                return True
+            else:
+                return False
+
+        self.wait_until_true(expected_caps, timeout=60)
diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py
new file mode 100644
index 000000000..24726b369
--- /dev/null
+++ b/qa/tasks/cephfs/test_client_recovery.py
@@ -0,0 +1,698 @@
+
+"""
+Teuthology task for exercising CephFS client recovery
+"""
+
+import logging
+from textwrap import dedent
+import time
+import distutils.version as version
+import re
+import os
+
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.packaging import get_package_version
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+
+class TestClientNetworkRecovery(CephFSTestCase):
+    REQUIRE_KCLIENT_REMOTE = True
+    REQUIRE_ONE_CLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
+
+    # Environment references
+    mds_reconnect_timeout = None
+    ms_max_backoff = None
+
+    def test_network_death(self):
+        """
+        Simulate software freeze or temporary network failure.
+
+        Check that the client blocks I/O during failure, and completes
+        I/O after failure.
+        """
+
+        session_timeout = self.fs.get_var("session_timeout")
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+
+        # We only need one client
+        self.mount_b.umount_wait()
+
+        # Initially our one client session should be visible
+        client_id = self.mount_a.get_global_id()
+        ls_data = self._session_list()
+        self.assert_session_count(1, ls_data)
+        self.assertEqual(ls_data[0]['id'], client_id)
+        self.assert_session_state(client_id, "open")
+
+        # ...and capable of doing I/O without blocking
+        self.mount_a.create_files()
+
+        # ...but if we turn off the network
+        self.fs.set_clients_block(True)
+
+        # ...and try and start an I/O
+        write_blocked = self.mount_a.write_background()
+
+        # ...then it should block
+        self.assertFalse(write_blocked.finished)
+        self.assert_session_state(client_id, "open")
+        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
+        self.assertFalse(write_blocked.finished)
+        self.assert_session_state(client_id, "stale")
+
+        # ...until we re-enable I/O
+        self.fs.set_clients_block(False)
+
+        # ...when it should complete promptly
+        a = time.time()
+        self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
+        write_blocked.wait()  # Already know we're finished, wait() to raise exception on errors
+        recovery_time = time.time() - a
+        log.info("recovery time: {0}".format(recovery_time))
+        self.assert_session_state(client_id, "open")
+
+
+class TestClientRecovery(CephFSTestCase):
+    REQUIRE_KCLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
+
+    # Environment references
+    mds_reconnect_timeout = None
+    ms_max_backoff = None
+
+    def test_basic(self):
+        # Check that two clients come up healthy and see each others' files
+        # =====================================================
+        self.mount_a.create_files()
+        self.mount_a.check_files()
+        self.mount_a.umount_wait()
+
+        self.mount_b.check_files()
+
+        self.mount_a.mount_wait()
+
+        # Check that the admin socket interface is correctly reporting
+        # two sessions
+        # =====================================================
+        ls_data = self._session_list()
+        self.assert_session_count(2, ls_data)
+
+        self.assertSetEqual(
+            set([l['id'] for l in ls_data]),
+            {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
+        )
+
+    def test_restart(self):
+        # Check that after an MDS restart both clients reconnect and continue
+        # to handle I/O
+        # =====================================================
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        self.mount_a.create_destroy()
+        self.mount_b.create_destroy()
+
+    def _session_num_caps(self, client_id):
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps'])
+
+    def test_reconnect_timeout(self):
+        # Reconnect timeout
+        # =================
+        # Check that if I stop an MDS and a client goes away, the MDS waits
+        # for the reconnect period
+
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        self.fs.fail()
+
+        self.mount_a.umount_wait(force=True)
+
+        self.fs.set_joinable()
+
+        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
+        # Check that the MDS locally reports its state correctly
+        status = self.fs.mds_asok(['status'])
+        self.assertIn("reconnect_status", status)
+
+        ls_data = self._session_list()
+        self.assert_session_count(2, ls_data)
+
+        # The session for the dead client should have the 'reconnect' flag set
+        self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])
+
+        # Wait for the reconnect state to clear, this should take the
+        # reconnect timeout period.
+        in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
+        # Check that the period we waited to enter active is within a factor
+        # of two of the reconnect timeout.
+        self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout // 2,
+                           "Should have been in reconnect phase for {0} but only took {1}".format(
+                               self.mds_reconnect_timeout, in_reconnect_for
+                           ))
+
+        self.assert_session_count(1)
+
+        # Check that the client that timed out during reconnect can
+        # mount again and do I/O
+        self.mount_a.mount_wait()
+        self.mount_a.create_destroy()
+
+        self.assert_session_count(2)
+
+    def test_reconnect_eviction(self):
+        # Eviction during reconnect
+        # =========================
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        self.fs.fail()
+
+        # The mount goes away while the MDS is offline
+        self.mount_a.kill()
+
+        # wait for it to die
+        time.sleep(5)
+
+        self.fs.set_joinable()
+
+        # Enter reconnect phase
+        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
+        self.assert_session_count(2)
+
+        # Evict the stuck client
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+        self.assert_session_count(1)
+
+        # Observe that we proceed to active phase without waiting full reconnect timeout
+        evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+        # Once we evict the troublemaker, the reconnect phase should complete
+        # in well under the reconnect timeout.
+        self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
+                        "reconnect did not complete soon enough after eviction, took {0}".format(
+                            evict_til_active
+                        ))
+
+        # We killed earlier so must clean up before trying to use again
+        self.mount_a.kill_cleanup()
+
+        # Bring the client back
+        self.mount_a.mount_wait()
+        self.mount_a.create_destroy()
+
+    def _test_stale_caps(self, write):
+        session_timeout = self.fs.get_var("session_timeout")
+
+        # Capability release from stale session
+        # =====================================
+        if write:
+            cap_holder = self.mount_a.open_background()
+        else:
+            self.mount_a.run_shell(["touch", "background_file"])
+            self.mount_a.umount_wait()
+            self.mount_a.mount_wait()
+            cap_holder = self.mount_a.open_background(write=False)
+
+        self.assert_session_count(2)
+        mount_a_gid = self.mount_a.get_global_id()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.suspend_netns()
+
+        # wait for it to die so it doesn't voluntarily release buffer cap
+        time.sleep(5)
+
+        try:
+            # Now, after session_timeout seconds, the waiter should
+            # complete their operation when the MDS marks the holder's
+            # session stale.
+            cap_waiter = self.mount_b.write_background()
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+
+            # Should have succeeded
+            self.assertEqual(cap_waiter.exitstatus, 0)
+
+            if write:
+                self.assert_session_count(1)
+            else:
+                self.assert_session_state(mount_a_gid, "stale")
+
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0,
+                            "Capability handover took {0}, expected approx {1}".format(
+                                cap_waited, session_timeout
+                            ))
+
+            self.mount_a._kill_background(cap_holder)
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.resume_netns()
+
+    def test_stale_read_caps(self):
+        self._test_stale_caps(False)
+
+    def test_stale_write_caps(self):
+        self._test_stale_caps(True)
+
+    def test_evicted_caps(self):
+        # Eviction while holding a capability
+        # ===================================
+
+        session_timeout = self.fs.get_var("session_timeout")
+
+        # Take out a write capability on a file on client A,
+        # and then immediately kill it.
+        cap_holder = self.mount_a.open_background()
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.suspend_netns()
+
+        # wait for it to die so it doesn't voluntarily release buffer cap
+        time.sleep(5)
+
+        try:
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+            time.sleep(5)
+            self.assertFalse(cap_waiter.finished)
+
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+            # Now, because I evicted the old holder of the capability, it should
+            # immediately get handed over to the waiter
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            # This is the check that it happened 'now' rather than waiting
+            # for the session timeout
+            self.assertLess(cap_waited, session_timeout / 2.0,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, session_timeout / 2.0
+                            ))
+
+            self.mount_a._kill_background(cap_holder)
+        finally:
+            self.mount_a.resume_netns()
+
+    def test_trim_caps(self):
+        # Trim capability when reconnecting MDS
+        # ===================================
+
+        count = 500
+        # Create lots of files
+        for i in range(count):
+            self.mount_a.run_shell(["touch", "f{0}".format(i)])
+
+        # Populate mount_b's cache
+        self.mount_b.run_shell(["ls", "-l"])
+
+        client_id = self.mount_b.get_global_id()
+        num_caps = self._session_num_caps(client_id)
+        self.assertGreaterEqual(num_caps, count)
+
+        # Restart MDS. client should trim its cache when reconnecting to the MDS
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        num_caps = self._session_num_caps(client_id)
+        self.assertLess(num_caps, count,
+                        "should have less than {0} capabilities, have {1}".format(
+                            count, num_caps
+                        ))
+
+    def _is_flockable(self):
+        a_version_str = get_package_version(self.mount_a.client_remote, "fuse")
+        b_version_str = get_package_version(self.mount_b.client_remote, "fuse")
+        flock_version_str = "2.9"
+
+        version_regex = re.compile(r"[0-9\.]+")
+        a_result = version_regex.match(a_version_str)
+        self.assertTrue(a_result)
+        b_result = version_regex.match(b_version_str)
+        self.assertTrue(b_result)
+        a_version = version.StrictVersion(a_result.group())
+        b_version = version.StrictVersion(b_result.group())
+        flock_version=version.StrictVersion(flock_version_str)
+
+        if (a_version >= flock_version and b_version >= flock_version):
+            log.info("flock locks are available")
+            return True
+        else:
+            log.info("not testing flock locks, machines have versions {av} and {bv}".format(
+                av=a_version_str,bv=b_version_str))
+            return False
+
+    def test_filelock(self):
+        """
+        Check that file lock doesn't get lost after an MDS restart
+        """
+
+        flockable = self._is_flockable()
+        lock_holder = self.mount_a.lock_background(do_flock=flockable)
+
+        self.mount_b.wait_for_visible("background_file-2")
+        self.mount_b.check_filelock(do_flock=flockable)
+
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        self.mount_b.check_filelock(do_flock=flockable)
+
+        # Tear down the background process
+        self.mount_a._kill_background(lock_holder)
+
+    def test_filelock_eviction(self):
+        """
+        Check that file lock held by evicted client is given to
+        waiting client.
+        """
+        if not self._is_flockable():
+            self.skipTest("flock is not available")
+
+        lock_holder = self.mount_a.lock_background()
+        self.mount_b.wait_for_visible("background_file-2")
+        self.mount_b.check_filelock()
+
+        lock_taker = self.mount_b.lock_and_release()
+        # Check the taker is waiting (doesn't get it immediately)
+        time.sleep(2)
+        self.assertFalse(lock_holder.finished)
+        self.assertFalse(lock_taker.finished)
+
+        try:
+            mount_a_client_id = self.mount_a.get_global_id()
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+
+            # Evicting mount_a should let mount_b's attempt to take the lock
+            # succeed
+            self.wait_until_true(lambda: lock_taker.finished, timeout=10)
+        finally:
+            # Tear down the background process
+            self.mount_a._kill_background(lock_holder)
+
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+        # Bring the client back
+        self.mount_a.mount_wait()
+
+    def test_dir_fsync(self):
+        self._test_fsync(True);
+
+    def test_create_fsync(self):
+        self._test_fsync(False);
+
+    def _test_fsync(self, dirfsync):
+        """
+        That calls to fsync guarantee visibility of metadata to another
+        client immediately after the fsyncing client dies.
+        """
+
+        # Leave this guy out until he's needed
+        self.mount_b.umount_wait()
+
+        # Create dir + child dentry on client A, and fsync the dir
+        path = os.path.join(self.mount_a.mountpoint, "subdir")
+        self.mount_a.run_python(
+            dedent("""
+                import os
+                import time
+
+                path = "{path}"
+
+                print("Starting creation...")
+                start = time.time()
+
+                os.mkdir(path)
+                dfd = os.open(path, os.O_DIRECTORY)
+
+                fd = open(os.path.join(path, "childfile"), "w")
+                print("Finished creation in {{0}}s".format(time.time() - start))
+
+                print("Starting fsync...")
+                start = time.time()
+                if {dirfsync}:
+                    os.fsync(dfd)
+                else:
+                    os.fsync(fd)
+                print("Finished fsync in {{0}}s".format(time.time() - start))
+            """.format(path=path,dirfsync=str(dirfsync)))
+        )
+
+        # Immediately kill the MDS and then client A
+        self.fs.fail()
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        # Restart the MDS.  Wait for it to come up, it'll have to time out in clientreplay
+        self.fs.set_joinable()
+        log.info("Waiting for reconnect...")
+        self.fs.wait_for_state("up:reconnect")
+        log.info("Waiting for active...")
+        self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
+        log.info("Reached active...")
+
+        # Is the child dentry visible from mount B?
+        self.mount_b.mount_wait()
+        self.mount_b.run_shell(["ls", "subdir/childfile"])
+
+    def test_unmount_for_evicted_client(self):
+        """Test if client hangs on unmount after evicting the client."""
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+
+        self.mount_a.umount_wait(require_clean=True, timeout=30)
+
+    def test_stale_renew(self):
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to handle signal STOP/CONT")
+
+        session_timeout = self.fs.get_var("session_timeout")
+
+        self.mount_a.run_shell(["mkdir", "testdir"])
+        self.mount_a.run_shell(["touch", "testdir/file1"])
+        # populate readdir cache
+        self.mount_a.run_shell(["ls", "testdir"])
+        self.mount_b.run_shell(["ls", "testdir"])
+
+        # check if readdir cache is effective
+        initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
+        self.mount_b.run_shell(["ls", "testdir"])
+        current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
+        self.assertEqual(current_readdirs, initial_readdirs);
+
+        mount_b_gid = self.mount_b.get_global_id()
+        # stop ceph-fuse process of mount_b
+        self.mount_b.suspend_netns()
+
+        self.assert_session_state(mount_b_gid, "open")
+        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
+
+        self.mount_a.run_shell(["touch", "testdir/file2"])
+        self.assert_session_state(mount_b_gid, "stale")
+
+        # resume ceph-fuse process of mount_b
+        self.mount_b.resume_netns()
+        # Is the new file visible from mount_b? (caps become invalid after session stale)
+        self.mount_b.run_shell(["ls", "testdir/file2"])
+
+    def test_abort_conn(self):
+        """
+        Check that abort_conn() skips closing mds sessions.
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Testing libcephfs function")
+
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        session_timeout = self.fs.get_var("session_timeout")
+
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        gid_str = self.mount_a.run_python(dedent("""
+            import cephfs as libcephfs
+            cephfs = libcephfs.LibCephFS(conffile='')
+            cephfs.mount()
+            client_id = cephfs.get_instance_id()
+            cephfs.abort_conn()
+            print(client_id)
+            """)
+        )
+        gid = int(gid_str);
+
+        self.assert_session_state(gid, "open")
+        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
+        self.assert_session_state(gid, "stale")
+
+    def test_dont_mark_unresponsive_client_stale(self):
+        """
+        Test that an unresponsive client holding caps is not marked stale or
+        evicted unless another clients wants its caps.
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to handle signal STOP/CONT")
+
+        # XXX: To conduct this test we need at least two clients since a
+        # single client is never evcited by MDS.
+        SESSION_TIMEOUT = 30
+        SESSION_AUTOCLOSE = 50
+        time_at_beg = time.time()
+        mount_a_gid = self.mount_a.get_global_id()
+        _ = self.mount_a.client_pid
+        self.fs.set_var('session_timeout', SESSION_TIMEOUT)
+        self.fs.set_var('session_autoclose', SESSION_AUTOCLOSE)
+        self.assert_session_count(2, self.fs.mds_asok(['session', 'ls']))
+
+        # test that client holding cap not required by any other client is not
+        # marked stale when it becomes unresponsive.
+        self.mount_a.run_shell(['mkdir', 'dir'])
+        self.mount_a.send_signal('sigstop')
+        time.sleep(SESSION_TIMEOUT + 2)
+        self.assert_session_state(mount_a_gid, "open")
+
+        # test that other clients have to wait to get the caps from
+        # unresponsive client until session_autoclose.
+        self.mount_b.run_shell(['stat', 'dir'])
+        self.assert_session_count(1, self.fs.mds_asok(['session', 'ls']))
+        self.assertLess(time.time(), time_at_beg + SESSION_AUTOCLOSE)
+
+        self.mount_a.send_signal('sigcont')
+
+    def test_config_session_timeout(self):
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        session_timeout = self.fs.get_var("session_timeout")
+        mount_a_gid = self.mount_a.get_global_id()
+
+        self.fs.mds_asok(['session', 'config', '%s' % mount_a_gid, 'timeout', '%s' % (session_timeout * 2)])
+
+        self.mount_a.kill();
+
+        self.assert_session_count(2)
+
+        time.sleep(session_timeout * 1.5)
+        self.assert_session_state(mount_a_gid, "open")
+
+        time.sleep(session_timeout)
+        self.assert_session_count(1)
+
+        self.mount_a.kill_cleanup()
+
+    def test_reconnect_after_blocklisted(self):
+        """
+        Test reconnect after blocklisted.
+        - writing to a fd that was opened before blocklist should return -EBADF
+        - reading/writing to a file with lost file locks should return -EIO
+        - readonly fd should continue to work
+        """
+
+        self.mount_a.umount_wait()
+
+        if isinstance(self.mount_a, FuseMount):
+            self.mount_a.mount_wait(mntopts=['--client_reconnect_stale=1', '--fuse_disable_pagecache=1'])
+        else:
+            try:
+                self.mount_a.mount_wait(mntopts=['recover_session=clean'])
+            except CommandFailedError:
+                self.mount_a.kill_cleanup()
+                self.skipTest("Not implemented in current kernel")
+
+        self.mount_a.wait_until_mounted()
+
+        path = os.path.join(self.mount_a.mountpoint, 'testfile_reconnect_after_blocklisted')
+        pyscript = dedent("""
+            import os
+            import sys
+            import fcntl
+            import errno
+            import time
+
+            fd1 = os.open("{path}.1", os.O_RDWR | os.O_CREAT, 0O666)
+            fd2 = os.open("{path}.1", os.O_RDONLY)
+            fd3 = os.open("{path}.2", os.O_RDWR | os.O_CREAT, 0O666)
+            fd4 = os.open("{path}.2", os.O_RDONLY)
+
+            os.write(fd1, b'content')
+            os.read(fd2, 1);
+
+            os.write(fd3, b'content')
+            os.read(fd4, 1);
+            fcntl.flock(fd4, fcntl.LOCK_SH | fcntl.LOCK_NB)
+
+            print("blocklist")
+            sys.stdout.flush()
+
+            sys.stdin.readline()
+
+            # wait for mds to close session
+            time.sleep(10);
+
+            # trigger 'open session' message. kclient relies on 'session reject' message
+            # to detect if itself is blocklisted
+            try:
+                os.stat("{path}.1")
+            except:
+                pass
+
+            # wait for auto reconnect
+            time.sleep(10);
+
+            try:
+                os.write(fd1, b'content')
+            except OSError as e:
+                if e.errno != errno.EBADF:
+                    raise
+            else:
+                raise RuntimeError("write() failed to raise error")
+
+            os.read(fd2, 1);
+
+            try:
+                os.read(fd4, 1)
+            except OSError as e:
+                if e.errno != errno.EIO:
+                    raise
+            else:
+                raise RuntimeError("read() failed to raise error")
+            """).format(path=path)
+        rproc = self.mount_a.client_remote.run(
+                    args=['python3', '-c', pyscript],
+                    wait=False, stdin=run.PIPE, stdout=run.PIPE)
+
+        rproc.stdout.readline()
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+
+        rproc.stdin.writelines(['done\n'])
+        rproc.stdin.flush()
+
+        rproc.wait()
+        self.assertEqual(rproc.exitstatus, 0)
diff --git a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py
new file mode 100644
index 000000000..4d0194429
--- /dev/null
+++ b/qa/tasks/cephfs/test_damage.py
@@ -0,0 +1,564 @@
+from io import BytesIO, StringIO
+import json
+import logging
+import errno
+import re
+from teuthology.contextutil import MaxWhileTries
+from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra.run import wait
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+DAMAGED_ON_START = "damaged_on_start"
+DAMAGED_ON_LS = "damaged_on_ls"
+CRASHED = "server crashed"
+NO_DAMAGE = "no damage"
+READONLY = "readonly"
+FAILED_CLIENT = "client failed"
+FAILED_SERVER = "server failed"
+
+# An EIO in response to a stat from the client
+EIO_ON_LS = "eio"
+
+# An EIO, but nothing in damage table (not ever what we expect)
+EIO_NO_DAMAGE = "eio without damage entry"
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDamage(CephFSTestCase):
+    def _simple_workload_write(self):
+        self.mount_a.run_shell(["mkdir", "subdir"])
+        self.mount_a.write_n_mb("subdir/sixmegs", 6)
+        return self.mount_a.stat("subdir/sixmegs")
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    @for_teuthology #459s
+    def test_object_deletion(self):
+        """
+        That the MDS has a clean 'damaged' response to loss of any single metadata object
+        """
+
+        self._simple_workload_write()
+
+        # Hmm, actually it would be nice to permute whether the metadata pool
+        # state contains sessions or not, but for the moment close this session
+        # to avoid waiting through reconnect on every MDS start.
+        self.mount_a.umount_wait()
+        for mds_name in self.fs.get_active_names():
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        self.fs.fail()
+
+        serialized = self.fs.radosmo(['export', '-'])
+
+        def is_ignored(obj_id, dentry=None):
+            """
+            A filter to avoid redundantly mutating many similar objects (e.g.
+            stray dirfrags) or similar dentries (e.g. stray dir dentries)
+            """
+            if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000":
+                return True
+
+            if dentry and obj_id == "100.00000000":
+                if re.match("stray.+_head", dentry) and dentry != "stray0_head":
+                    return True
+
+            return False
+
+        def get_path(obj_id, dentry=None):
+            """
+            What filesystem path does this object or dentry correspond to?   i.e.
+            what should I poke to see EIO after damaging it?
+            """
+
+            if obj_id == "1.00000000" and dentry == "subdir_head":
+                return "./subdir"
+            elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head":
+                return "./subdir/sixmegs"
+
+            # None means ls will do an "ls -R" in hope of seeing some errors
+            return None
+
+        objects = self.fs.radosmo(["ls"], stdout=StringIO()).strip().split("\n")
+        objects = [o for o in objects if not is_ignored(o)]
+
+        # Find all objects with an OMAP header
+        omap_header_objs = []
+        for o in objects:
+            header = self.fs.radosmo(["getomapheader", o], stdout=StringIO())
+            # The rados CLI wraps the header output in a hex-printed style
+            header_bytes = int(re.match("header \((.+) bytes\)", header).group(1))
+            if header_bytes > 0:
+                omap_header_objs.append(o)
+
+        # Find all OMAP key/vals
+        omap_keys = []
+        for o in objects:
+            keys_str = self.fs.radosmo(["listomapkeys", o], stdout=StringIO())
+            if keys_str:
+                for key in keys_str.strip().split("\n"):
+                    if not is_ignored(o, key):
+                        omap_keys.append((o, key))
+
+        # Find objects that have data in their bodies
+        data_objects = []
+        for obj_id in objects:
+            stat_out = self.fs.radosmo(["stat", obj_id], stdout=StringIO())
+            size = int(re.match(".+, size (.+)$", stat_out).group(1))
+            if size > 0:
+                data_objects.append(obj_id)
+
+        # Define the various forms of damage we will inflict
+        class MetadataMutation(object):
+            def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None):
+                self.obj_id = obj_id_
+                self.desc = desc_
+                self.mutate_fn = mutate_fn_
+                self.expectation = expectation_
+                if ls_path is None:
+                    self.ls_path = "."
+                else:
+                    self.ls_path = ls_path
+
+            def __eq__(self, other):
+                return self.desc == other.desc
+
+            def __hash__(self):
+                return hash(self.desc)
+
+        junk = "deadbeef" * 10
+        mutations = []
+
+        # Removals
+        for o in objects:
+            if o in [
+                # JournalPointers are auto-replaced if missing (same path as upgrade)
+                "400.00000000",
+                # Missing dirfrags for non-system dirs result in empty directory
+                "10000000000.00000000",
+                # PurgeQueue is auto-created if not found on startup
+                "500.00000000",
+                # open file table is auto-created if not found on startup
+                "mds0_openfiles.0"
+            ]:
+                expectation = NO_DAMAGE
+            else:
+                expectation = DAMAGED_ON_START
+
+            log.info("Expectation on rm '{0}' will be '{1}'".format(
+                o, expectation
+            ))
+
+            mutations.append(MetadataMutation(
+                o,
+                "Delete {0}".format(o),
+                lambda o=o: self.fs.radosm(["rm", o]),
+                expectation
+            ))
+
+        # Blatant corruptions
+        for obj_id in data_objects:
+            if obj_id == "500.00000000":
+                # purge queue corruption results in read-only FS
+                mutations.append(MetadataMutation(
+                    obj_id,
+                    "Corrupt {0}".format(obj_id),
+                    lambda o=obj_id: self.fs.radosm(["put", o, "-"], stdin=StringIO(junk)),
+                    READONLY
+                ))
+            else:
+                mutations.append(MetadataMutation(
+                    obj_id,
+                    "Corrupt {0}".format(obj_id),
+                    lambda o=obj_id: self.fs.radosm(["put", o, "-"], stdin=StringIO(junk)),
+                    DAMAGED_ON_START
+                ))
+
+        # Truncations
+        for o in data_objects:
+            if o == "500.00000000":
+                # The PurgeQueue is allowed to be empty: Journaler interprets
+                # an empty header object as an empty journal.
+                expectation = NO_DAMAGE
+            else:
+                expectation = DAMAGED_ON_START
+
+            mutations.append(
+                MetadataMutation(
+                    o,
+                    "Truncate {0}".format(o),
+                    lambda o=o: self.fs.radosm(["truncate", o, "0"]),
+                    expectation
+            ))
+
+        # OMAP value corruptions
+        for o, k in omap_keys:
+            if o.startswith("100."):
+                # Anything in rank 0's 'mydir'
+                expectation = DAMAGED_ON_START
+            else:
+                expectation = EIO_ON_LS
+
+            mutations.append(
+                MetadataMutation(
+                    o,
+                    "Corrupt omap key {0}:{1}".format(o, k),
+                    lambda o=o,k=k: self.fs.radosm(["setomapval", o, k, junk]),
+                    expectation,
+                    get_path(o, k)
+                )
+            )
+
+        # OMAP header corruptions
+        for o in omap_header_objs:
+            if re.match("60.\.00000000", o) \
+                    or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
+                expectation = DAMAGED_ON_START
+            else:
+                expectation = NO_DAMAGE
+
+            log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
+                o, expectation
+            ))
+
+            mutations.append(
+                MetadataMutation(
+                    o,
+                    "Corrupt omap header on {0}".format(o),
+                    lambda o=o: self.fs.radosm(["setomapheader", o, junk]),
+                    expectation
+                )
+            )
+
+        results = {}
+
+        for mutation in mutations:
+            log.info("Applying mutation '{0}'".format(mutation.desc))
+
+            # Reset MDS state
+            self.mount_a.umount_wait(force=True)
+            self.fs.fail()
+            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+            # Reset RADOS pool state
+            self.fs.radosm(['import', '-'], stdin=BytesIO(serialized))
+
+            # Inject the mutation
+            mutation.mutate_fn()
+
+            # Try starting the MDS
+            self.fs.set_joinable()
+
+            # How long we'll wait between starting a daemon and expecting
+            # it to make it through startup, and potentially declare itself
+            # damaged to the mon cluster.
+            startup_timeout = 60
+
+            if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE):
+                if mutation.expectation == DAMAGED_ON_START:
+                    # The MDS may pass through active before making it to damaged
+                    try:
+                        self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout)
+                    except RuntimeError:
+                        pass
+
+                # Wait for MDS to either come up or go into damaged state
+                try:
+                    self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout)
+                except RuntimeError:
+                    crashed = False
+                    # Didn't make it to healthy or damaged, did it crash?
+                    for daemon_id, daemon in self.fs.mds_daemons.items():
+                        if daemon.proc and daemon.proc.finished:
+                            crashed = True
+                            log.error("Daemon {0} crashed!".format(daemon_id))
+                            daemon.proc = None  # So that subsequent stop() doesn't raise error
+                    if not crashed:
+                        # Didn't go health, didn't go damaged, didn't crash, so what?
+                        raise
+                    else:
+                        log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
+                        results[mutation] = CRASHED
+                        continue
+                if self.is_marked_damaged(0):
+                    log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
+                    results[mutation] = DAMAGED_ON_START
+                    continue
+                else:
+                    log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
+            else:
+                try:
+                    self.wait_until_true(self.fs.are_daemons_healthy, 60)
+                except RuntimeError:
+                    log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
+                    if self.is_marked_damaged(0):
+                        results[mutation] = DAMAGED_ON_START
+                    else:
+                        results[mutation] = FAILED_SERVER
+                    continue
+                log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))
+
+            # MDS is up, should go damaged on ls or client mount
+            self.mount_a.mount_wait()
+            if mutation.ls_path == ".":
+                proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False)
+            else:
+                proc = self.mount_a.stat(mutation.ls_path, wait=False)
+
+            if mutation.expectation == DAMAGED_ON_LS:
+                try:
+                    self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+                    log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
+                    results[mutation] = DAMAGED_ON_LS
+                except RuntimeError:
+                    if self.fs.are_daemons_healthy():
+                        log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
+                            mutation.desc))
+                        results[mutation] = NO_DAMAGE
+                    else:
+                        log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
+                        results[mutation] = FAILED_SERVER
+            elif mutation.expectation == READONLY:
+                proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False)
+                try:
+                    proc.wait()
+                except CommandFailedError:
+                    stderr = proc.stderr.getvalue()
+                    log.info(stderr)
+                    if "Read-only file system".lower() in stderr.lower():
+                        pass
+                    else:
+                        raise
+            else:
+                try:
+                    wait([proc], 20)
+                    log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc))
+                    results[mutation] = NO_DAMAGE
+                except MaxWhileTries:
+                    log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
+                    results[mutation] = FAILED_CLIENT
+                except CommandFailedError as e:
+                    if e.exitstatus == errno.EIO:
+                        log.info("Result: EIO on client")
+                        results[mutation] = EIO_ON_LS
+                    else:
+                        log.info("Result: unexpected error {0} on client".format(e))
+                        results[mutation] = FAILED_CLIENT
+
+            if mutation.expectation == EIO_ON_LS:
+                # EIOs mean something handled by DamageTable: assert that it has
+                # been populated
+                damage = json.loads(
+                    self.fs.mon_manager.raw_cluster_cmd(
+                        'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty'))
+                if len(damage) == 0:
+                    results[mutation] = EIO_NO_DAMAGE
+
+        failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
+        if failures:
+            log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
+            for mutation, result in failures:
+                log.error("  Expected '{0}' actually '{1}' from '{2}'".format(
+                    mutation.expectation, result, mutation.desc
+                ))
+            raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
+        else:
+            log.info("All {0} mutations had expected outcomes".format(len(mutations)))
+
+    def test_damaged_dentry(self):
+        # Damage to dentrys is interesting because it leaves the
+        # directory's `complete` flag in a subtle state where
+        # we have marked the dir complete in order that folks
+        # can access it, but in actual fact there is a dentry
+        # missing
+        self.mount_a.run_shell(["mkdir", "subdir/"])
+
+        self.mount_a.run_shell(["touch", "subdir/file_undamaged"])
+        self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"])
+
+        subdir_ino = self.mount_a.path_to_ino("subdir")
+
+        self.mount_a.umount_wait()
+        for mds_name in self.fs.get_active_names():
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        self.fs.fail()
+
+        # Corrupt a dentry
+        junk = "deadbeef" * 10
+        dirfrag_obj = "{0:x}.00000000".format(subdir_ino)
+        self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
+
+        # Start up and try to list it
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount_wait()
+        dentries = self.mount_a.ls("subdir/")
+
+        # The damaged guy should have disappeared
+        self.assertEqual(dentries, ["file_undamaged"])
+
+        # I should get ENOENT if I try and read it normally, because
+        # the dir is considered complete
+        try:
+            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            raise AssertionError("Expected ENOENT")
+
+        # The fact that there is damaged should have bee recorded
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        self.assertEqual(len(damage), 1)
+        damage_id = damage[0]['id']
+
+        # If I try to create a dentry with the same name as the damaged guy
+        # then that should be forbidden
+        try:
+            self.mount_a.touch("subdir/file_to_be_damaged")
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.EIO)
+        else:
+            raise AssertionError("Expected EIO")
+
+        # Attempting that touch will clear the client's complete flag, now
+        # when I stat it I'll get EIO instead of ENOENT
+        try:
+            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
+        except CommandFailedError as e:
+            if isinstance(self.mount_a, FuseMount):
+                self.assertEqual(e.exitstatus, errno.EIO)
+            else:
+                # Old kernel client handles this case differently
+                self.assertIn(e.exitstatus, [errno.ENOENT, errno.EIO])
+        else:
+            raise AssertionError("Expected EIO")
+
+        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
+        self.assertEqual(nfiles, "2")
+
+        self.mount_a.umount_wait()
+
+        # Now repair the stats
+        scrub_json = self.fs.run_scrub(["start", "/subdir", "repair"])
+        log.info(json.dumps(scrub_json, indent=2))
+
+        self.assertNotEqual(scrub_json, None)
+        self.assertEqual(scrub_json["return_code"], 0)
+        self.assertEqual(self.fs.wait_until_scrub_complete(tag=scrub_json["scrub_tag"]), True)
+
+        # Check that the file count is now correct
+        self.mount_a.mount_wait()
+        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
+        self.assertEqual(nfiles, "1")
+
+        # Clean up the omap object
+        self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
+
+        # Clean up the damagetable entry
+        self.fs.mon_manager.raw_cluster_cmd(
+            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+            "damage", "rm", "{did}".format(did=damage_id))
+
+        # Now I should be able to create a file with the same name as the
+        # damaged guy if I want.
+        self.mount_a.touch("subdir/file_to_be_damaged")
+
+    def test_open_ino_errors(self):
+        """
+        That errors encountered during opening inos are properly propagated
+        """
+
+        self.mount_a.run_shell(["mkdir", "dir1"])
+        self.mount_a.run_shell(["touch", "dir1/file1"])
+        self.mount_a.run_shell(["mkdir", "dir2"])
+        self.mount_a.run_shell(["touch", "dir2/file2"])
+        self.mount_a.run_shell(["mkdir", "testdir"])
+        self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"])
+        self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"])
+
+        file1_ino = self.mount_a.path_to_ino("dir1/file1")
+        file2_ino = self.mount_a.path_to_ino("dir2/file2")
+        dir2_ino = self.mount_a.path_to_ino("dir2")
+
+        # Ensure everything is written to backing store
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Drop everything from the MDS cache
+        self.fs.fail()
+        self.fs.journal_tool(['journal', 'reset'], 0)
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount_wait()
+
+        # Case 1: un-decodeable backtrace
+
+        # Validate that the backtrace is present and decodable
+        self.fs.read_backtrace(file1_ino)
+        # Go corrupt the backtrace of alpha/target (used for resolving
+        # bravo/hardlink).
+        self.fs._write_data_xattr(file1_ino, "parent", "rhubarb")
+
+        # Check that touching the hardlink gives EIO
+        ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False)
+        try:
+            ran.wait()
+        except CommandFailedError:
+            self.assertTrue("Input/output error" in ran.stderr.getvalue())
+
+        # Check that an entry is created in the damage table
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        self.assertEqual(len(damage), 1)
+        self.assertEqual(damage[0]['damage_type'], "backtrace")
+        self.assertEqual(damage[0]['ino'], file1_ino)
+
+        self.fs.mon_manager.raw_cluster_cmd(
+            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+            "damage", "rm", str(damage[0]['id']))
+
+
+        # Case 2: missing dirfrag for the target inode
+
+        self.fs.radosm(["rm", "{0:x}.00000000".format(dir2_ino)])
+
+        # Check that touching the hardlink gives EIO
+        ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False)
+        try:
+            ran.wait()
+        except CommandFailedError:
+            self.assertTrue("Input/output error" in ran.stderr.getvalue())
+
+        # Check that an entry is created in the damage table
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        self.assertEqual(len(damage), 2)
+        if damage[0]['damage_type'] == "backtrace" :
+            self.assertEqual(damage[0]['ino'], file2_ino)
+            self.assertEqual(damage[1]['damage_type'], "dir_frag")
+            self.assertEqual(damage[1]['ino'], dir2_ino)
+        else:
+            self.assertEqual(damage[0]['damage_type'], "dir_frag")
+            self.assertEqual(damage[0]['ino'], dir2_ino)
+            self.assertEqual(damage[1]['damage_type'], "backtrace")
+            self.assertEqual(damage[1]['ino'], file2_ino)
+
+        for entry in damage:
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "rm", str(entry['id']))
diff --git a/qa/tasks/cephfs/test_data_scan.py b/qa/tasks/cephfs/test_data_scan.py
new file mode 100644
index 000000000..dcb7eda40
--- /dev/null
+++ b/qa/tasks/cephfs/test_data_scan.py
@@ -0,0 +1,688 @@
+
+"""
+Test our tools for recovering metadata from the data pool
+"""
+import json
+
+import logging
+import os
+import time
+import traceback
+
+from io import BytesIO, StringIO
+from collections import namedtuple, defaultdict
+from textwrap import dedent
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class Workload(object):
+    def __init__(self, filesystem, mount):
+        self._mount = mount
+        self._filesystem = filesystem
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        pool = self._filesystem.get_metadata_pool_name()
+        self._filesystem.rados(["purge", pool, '--yes-i-really-really-mean-it'])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._filesystem.mds_asok(["flush", "journal"])
+
+
+class SimpleWorkload(Workload):
+    """
+    Single file, single directory, check that it gets recovered and so does its size
+    """
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def validate(self):
+        self._mount.run_shell(["ls", "subdir"], sudo=True)
+        st = self._mount.stat("subdir/sixmegs", sudo=True)
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+
+class MovedFile(Workload):
+    def write(self):
+        # Create a file whose backtrace disagrees with his eventual position
+        # in the metadata.  We will see that he gets reconstructed in his
+        # original position according to his backtrace.
+        self._mount.run_shell(["mkdir", "subdir_alpha"])
+        self._mount.run_shell(["mkdir", "subdir_bravo"])
+        self._mount.write_n_mb("subdir_alpha/sixmegs", 6)
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"])
+        self._initial_state = self._mount.stat("subdir_bravo/sixmegs")
+
+    def flush(self):
+        pass
+
+    def validate(self):
+        self.assert_equal(self._mount.ls(sudo=True), ["subdir_alpha"])
+        st = self._mount.stat("subdir_alpha/sixmegs", sudo=True)
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+
+class BacktracelessFile(Workload):
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def flush(self):
+        # Never flush metadata, so backtrace won't be written
+        pass
+
+    def validate(self):
+        ino_name = "%x" % self._initial_state["st_ino"]
+
+        # The inode should be linked into lost+found because we had no path for it
+        self.assert_equal(self._mount.ls(sudo=True), ["lost+found"])
+        self.assert_equal(self._mount.ls("lost+found", sudo=True), [ino_name])
+        st = self._mount.stat(f"lost+found/{ino_name}", sudo=True)
+
+        # We might not have got the name or path, but we should still get the size
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+        return self._errors
+
+
+class StripedStashedLayout(Workload):
+    def __init__(self, fs, m):
+        super(StripedStashedLayout, self).__init__(fs, m)
+
+        # Nice small stripes so we can quickly do our writes+validates
+        self.sc = 4
+        self.ss = 65536
+        self.os = 262144
+
+        self.interesting_sizes = [
+            # Exactly stripe_count objects will exist
+            self.os * self.sc,
+            # Fewer than stripe_count objects will exist
+            self.os * self.sc // 2,
+            self.os * (self.sc - 1) + self.os // 2,
+            self.os * (self.sc - 1) + self.os // 2 - 1,
+            self.os * (self.sc + 1) + self.os // 2,
+            self.os * (self.sc + 1) + self.os // 2 + 1,
+            # More than stripe_count objects will exist
+            self.os * self.sc + self.os * self.sc // 2
+        ]
+
+    def write(self):
+        # Create a dir with a striped layout set on it
+        self._mount.run_shell(["mkdir", "stripey"])
+
+        self._mount.setfattr("./stripey", "ceph.dir.layout",
+             "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format(
+                 ss=self.ss, os=self.os, sc=self.sc,
+                 pool=self._filesystem.get_data_pool_name()
+             ))
+
+        # Write files, then flush metadata so that its layout gets written into an xattr
+        for i, n_bytes in enumerate(self.interesting_sizes):
+            self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+            # This is really just validating the validator
+            self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+        self._filesystem.mds_asok(["flush", "journal"])
+
+        # Write another file in the same way, but this time don't flush the metadata,
+        # so that it won't have the layout xattr
+        self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512)
+        self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512)
+
+        self._initial_state = {
+            "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file")
+        }
+
+    def flush(self):
+        # Pass because we already selectively flushed during write
+        pass
+
+    def validate(self):
+        # The first files should have been recovered into its original location
+        # with the correct layout: read back correct data
+        for i, n_bytes in enumerate(self.interesting_sizes):
+            try:
+                self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+            except CommandFailedError as e:
+                self._errors.append(
+                    ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3))
+                )
+
+        # The unflushed file should have been recovered into lost+found without
+        # the correct layout: read back junk
+        ino_name = "%x" % self._initial_state["unflushed_ino"]
+        self.assert_equal(self._mount.ls("lost+found", sudo=True), [ino_name])
+        try:
+            self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512)
+        except CommandFailedError:
+            pass
+        else:
+            self._errors.append(
+                ValidationError("Unexpectedly valid data in unflushed striped file", "")
+            )
+
+        return self._errors
+
+
+class ManyFilesWorkload(Workload):
+    def __init__(self, filesystem, mount, file_count):
+        super(ManyFilesWorkload, self).__init__(filesystem, mount)
+        self.file_count = file_count
+
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        for n in range(0, self.file_count):
+            self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
+
+    def validate(self):
+        for n in range(0, self.file_count):
+            try:
+                self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
+            except CommandFailedError as e:
+                self._errors.append(
+                    ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3))
+                )
+
+        return self._errors
+
+
+class MovedDir(Workload):
+    def write(self):
+        # Create a nested dir that we will then move.  Two files with two different
+        # backtraces referring to the moved dir, claiming two different locations for
+        # it.  We will see that only one backtrace wins and the dir ends up with
+        # single linkage.
+        self._mount.run_shell(["mkdir", "-p", "grandmother/parent"])
+        self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1)
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._mount.run_shell(["mkdir", "grandfather"])
+        self._mount.run_shell(["mv", "grandmother/parent", "grandfather"])
+        self._mount.write_n_mb("grandfather/parent/new_pos_file", 2)
+        self._filesystem.mds_asok(["flush", "journal"])
+
+        self._initial_state = (
+            self._mount.stat("grandfather/parent/orig_pos_file"),
+            self._mount.stat("grandfather/parent/new_pos_file")
+        )
+
+    def validate(self):
+        root_files = self._mount.ls()
+        self.assert_equal(len(root_files), 1)
+        self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True)
+        winner = root_files[0]
+        st_opf = self._mount.stat(f"{winner}/parent/orig_pos_file", sudo=True)
+        st_npf = self._mount.stat(f"{winner}/parent/new_pos_file", sudo=True)
+
+        self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size'])
+        self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size'])
+
+
+class MissingZerothObject(Workload):
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def damage(self):
+        super(MissingZerothObject, self).damage()
+        zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino'])
+        self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name())
+
+    def validate(self):
+        ino = self._initial_state['st_ino']
+        st = self._mount.stat(f"lost+found/{ino:x}", sudo=True)
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+
+class NonDefaultLayout(Workload):
+    """
+    Check that the reconstruction copes with files that have a different
+    object size in their layout
+    """
+    def write(self):
+        self._mount.run_shell(["touch", "datafile"])
+        self._mount.setfattr("./datafile", "ceph.file.layout.object_size", "8388608")
+        self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"])
+        self._initial_state = self._mount.stat("datafile")
+
+    def validate(self):
+        # Check we got the layout reconstructed properly
+        object_size = int(self._mount.getfattr("./datafile", "ceph.file.layout.object_size", sudo=True))
+        self.assert_equal(object_size, 8388608)
+
+        # Check we got the file size reconstructed properly
+        st = self._mount.stat("datafile", sudo=True)
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+
+class TestDataScan(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    def _rebuild_metadata(self, workload, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        workload.flush()
+
+        # Stop the MDS
+        self.fs.fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        self.fs.reset()
+
+        self.fs.set_joinable() # redundant with reset
+
+        def get_state(mds_id):
+            info = self.mds_cluster.get_mds_info(mds_id)
+            return info['state'] if info is not None else None
+
+        self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+        for mds_id in self.fs.mds_ids:
+            self.wait_until_equal(
+                    lambda: get_state(mds_id),
+                    "up:standby",
+                    timeout=60)
+
+        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure
+        if False:
+            with self.assertRaises(CommandFailedError):
+                # Normal reset should fail when no objects are present, we'll use --force instead
+                self.fs.journal_tool(["journal", "reset"], 0)
+
+        self.fs.journal_tool(["journal", "reset", "--force"], 0)
+        self.fs.data_scan(["init"])
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
+        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
+
+        # Mark the MDS repaired
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+        # Start the MDS
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        log.info(str(self.mds_cluster.status()))
+
+        # Mount a client
+        self.mount_a.mount_wait()
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a))
+
+    def test_rebuild_moved_file(self):
+        self._rebuild_metadata(MovedFile(self.fs, self.mount_a))
+
+    def test_rebuild_backtraceless(self):
+        self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a))
+
+    def test_rebuild_moved_dir(self):
+        self._rebuild_metadata(MovedDir(self.fs, self.mount_a))
+
+    def test_rebuild_missing_zeroth(self):
+        self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a))
+
+    def test_rebuild_nondefault_layout(self):
+        self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a))
+
+    def test_stashed_layout(self):
+        self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))
+
+    def _dirfrag_keys(self, object_id):
+        keys_str = self.fs.radosmo(["listomapkeys", object_id], stdout=StringIO())
+        if keys_str:
+            return keys_str.strip().split("\n")
+        else:
+            return []
+
+    def test_fragmented_injection(self):
+        """
+        That when injecting a dentry into a fragmented directory, we put it in the right fragment.
+        """
+
+        file_count = 100
+        file_names = ["%s" % n for n in range(0, file_count)]
+
+        # Make sure and disable dirfrag auto merging and splitting
+        self.fs.set_ceph_conf('mds', 'mds bal merge size', 0)
+        self.fs.set_ceph_conf('mds', 'mds bal split size', 100 * file_count)
+
+        # Create a directory of `file_count` files, each named after its
+        # decimal number and containing the string of its decimal number
+        self.mount_a.run_python(dedent("""
+        import os
+        path = os.path.join("{path}", "subdir")
+        os.mkdir(path)
+        for n in range(0, {file_count}):
+            open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+        """.format(
+            path=self.mount_a.mountpoint,
+            file_count=file_count
+        )))
+
+        dir_ino = self.mount_a.path_to_ino("subdir")
+
+        # Only one MDS should be active!
+        self.assertEqual(len(self.fs.get_active_names()), 1)
+
+        # Ensure that one directory is fragmented
+        mds_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id)
+
+        # Flush journal and stop MDS
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        self.fs.fail()
+
+        # Pick a dentry and wipe out its key
+        # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
+        frag_obj_id = "{0:x}.01000000".format(dir_ino)
+        keys = self._dirfrag_keys(frag_obj_id)
+        victim_key = keys[7]  # arbitrary choice
+        log.info("victim_key={0}".format(victim_key))
+        victim_dentry = victim_key.split("_head")[0]
+        self.fs.radosm(["rmomapkey", frag_obj_id, victim_key])
+
+        # Start filesystem back up, observe that the file appears to be gone in an `ls`
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount_wait()
+        files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
+        self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry]))))
+
+        # Stop the filesystem
+        self.mount_a.umount_wait()
+        self.fs.fail()
+
+        # Run data-scan, observe that it inserts our dentry back into the correct fragment
+        # by checking the omap now has the dentry's key again
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
+        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()])
+        self.fs.data_scan(["scan_links"])
+        self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id))
+
+        # Start the filesystem and check that the dentry we deleted is now once again visible
+        # and points to the correct file data.
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell(["ls", "-l", "subdir/"]) # debugging
+        # Use sudo because cephfs-data-scan will reinsert the dentry with root ownership, it can't know the real owner.
+        out = self.mount_a.run_shell_payload(f"cat subdir/{victim_dentry}", sudo=True).stdout.getvalue().strip()
+        self.assertEqual(out, victim_dentry)
+
+        # Finally, close the loop by checking our injected dentry survives a merge
+        mds_id = self.fs.get_active_names()[0]
+        self.mount_a.ls("subdir")  # Do an ls to ensure both frags are in cache so the merge will work
+        self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id)
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        frag_obj_id = "{0:x}.00000000".format(dir_ino)
+        keys = self._dirfrag_keys(frag_obj_id)
+        self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names]))
+
+        # run scrub to update and make sure rstat.rbytes info in subdir inode and dirfrag
+        # are matched
+        out_json = self.fs.run_scrub(["start", "/subdir", "repair,recursive"])
+        self.assertNotEqual(out_json, None)
+        self.assertEqual(out_json["return_code"], 0)
+        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
+
+        # Remove the whole 'sudbdir' directory
+        self.mount_a.run_shell(["rm", "-rf", "subdir/"])
+
+    @for_teuthology
+    def test_parallel_execution(self):
+        self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7)
+
+    def test_pg_files(self):
+        """
+        That the pg files command tells us which files are associated with
+        a particular PG
+        """
+        file_count = 20
+        self.mount_a.run_shell(["mkdir", "mydir"])
+        self.mount_a.create_n_files("mydir/myfile", file_count)
+
+        # Some files elsewhere in the system that we will ignore
+        # to check that the tool is filtering properly
+        self.mount_a.run_shell(["mkdir", "otherdir"])
+        self.mount_a.create_n_files("otherdir/otherfile", file_count)
+
+        pgs_to_files = defaultdict(list)
+        # Rough (slow) reimplementation of the logic
+        for i in range(0, file_count):
+            file_path = "mydir/myfile_{0}".format(i)
+            ino = self.mount_a.path_to_ino(file_path)
+            obj = "{0:x}.{1:08x}".format(ino, 0)
+            pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd(
+                "osd", "map", self.fs.get_data_pool_name(), obj,
+                "--format=json-pretty"
+            ))['pgid']
+            pgs_to_files[pgid].append(file_path)
+            log.info("{0}: {1}".format(file_path, pgid))
+
+        pg_count = self.fs.get_pool_pg_num(self.fs.get_data_pool_name())
+        for pg_n in range(0, pg_count):
+            pg_str = "{0}.{1:x}".format(self.fs.get_data_pool_id(), pg_n)
+            out = self.fs.data_scan(["pg_files", "mydir", pg_str])
+            lines = [l for l in out.split("\n") if l]
+            log.info("{0}: {1}".format(pg_str, lines))
+            self.assertSetEqual(set(lines), set(pgs_to_files[pg_str]))
+
+    def test_rebuild_linkage(self):
+        """
+        The scan_links command fixes linkage errors
+        """
+        self.mount_a.run_shell(["mkdir", "testdir1"])
+        self.mount_a.run_shell(["mkdir", "testdir2"])
+        dir1_ino = self.mount_a.path_to_ino("testdir1")
+        dir2_ino = self.mount_a.path_to_ino("testdir2")
+        dirfrag1_oid = "{0:x}.00000000".format(dir1_ino)
+        dirfrag2_oid = "{0:x}.00000000".format(dir2_ino)
+
+        self.mount_a.run_shell(["touch", "testdir1/file1"])
+        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"])
+        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"])
+
+        mds_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+
+        dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid)
+
+        # introduce duplicated primary link
+        file1_key = "file1_head"
+        self.assertIn(file1_key, dirfrag1_keys)
+        file1_omap_data = self.fs.radosmo(["getomapval", dirfrag1_oid, file1_key, '-'])
+        self.fs.radosm(["setomapval", dirfrag2_oid, file1_key], stdin=BytesIO(file1_omap_data))
+        self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
+
+        # remove a remote link, make inode link count incorrect
+        link1_key = 'link1_head'
+        self.assertIn(link1_key, dirfrag1_keys)
+        self.fs.radosm(["rmomapkey", dirfrag1_oid, link1_key])
+
+        # increase good primary link's version
+        self.mount_a.run_shell(["touch", "testdir1/file1"])
+        self.mount_a.umount_wait()
+
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        self.fs.fail()
+
+        # repair linkage errors
+        self.fs.data_scan(["scan_links"])
+
+        # primary link in testdir2 was deleted?
+        self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
+
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount_wait()
+
+        # link count was adjusted?
+        file1_nlink = self.mount_a.path_to_nlink("testdir1/file1")
+        self.assertEqual(file1_nlink, 2)
+
+    def test_rebuild_inotable(self):
+        """
+        The scan_links command repair inotables
+        """
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+
+        active_mds_names = self.fs.get_active_names()
+        mds0_id = active_mds_names[0]
+        mds1_id = active_mds_names[1]
+
+        self.mount_a.run_shell(["mkdir", "dir1"])
+        dir_ino = self.mount_a.path_to_ino("dir1")
+        self.mount_a.setfattr("dir1", "ceph.dir.pin", "1")
+        # wait for subtree migration
+
+        file_ino = 0;
+        while True:
+            time.sleep(1)
+            # allocate an inode from mds.1
+            self.mount_a.run_shell(["touch", "dir1/file1"])
+            file_ino = self.mount_a.path_to_ino("dir1/file1")
+            if file_ino >= (2 << 40):
+                break
+            self.mount_a.run_shell(["rm", "-f", "dir1/file1"])
+
+        self.mount_a.umount_wait()
+
+        self.fs.mds_asok(["flush", "journal"], mds0_id)
+        self.fs.mds_asok(["flush", "journal"], mds1_id)
+        self.fs.fail()
+
+        self.fs.radosm(["rm", "mds0_inotable"])
+        self.fs.radosm(["rm", "mds1_inotable"])
+
+        self.fs.data_scan(["scan_links", "--filesystem", self.fs.name])
+
+        mds0_inotable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "inode"]))
+        self.assertGreaterEqual(
+            mds0_inotable['0']['data']['inotable']['free'][0]['start'], dir_ino)
+
+        mds1_inotable = json.loads(self.fs.table_tool([self.fs.name + ":1", "show", "inode"]))
+        self.assertGreaterEqual(
+            mds1_inotable['1']['data']['inotable']['free'][0]['start'], file_ino)
+
+    def test_rebuild_snaptable(self):
+        """
+        The scan_links command repair snaptable
+        """
+        self.fs.set_allow_new_snaps(True)
+
+        self.mount_a.run_shell(["mkdir", "dir1"])
+        self.mount_a.run_shell(["mkdir", "dir1/.snap/s1"])
+        self.mount_a.run_shell(["mkdir", "dir1/.snap/s2"])
+        self.mount_a.run_shell(["rmdir", "dir1/.snap/s2"])
+
+        self.mount_a.umount_wait()
+
+        mds0_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["flush", "journal"], mds0_id)
+
+        # wait for mds to update removed snaps
+        time.sleep(10)
+
+        old_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"]))
+        # stamps may have minor difference
+        for item in old_snaptable['snapserver']['snaps']:
+            del item['stamp']
+
+        self.fs.radosm(["rm", "mds_snaptable"])
+        self.fs.data_scan(["scan_links", "--filesystem", self.fs.name])
+
+        new_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"]))
+        for item in new_snaptable['snapserver']['snaps']:
+            del item['stamp']
+        self.assertGreaterEqual(
+            new_snaptable['snapserver']['last_snap'], old_snaptable['snapserver']['last_snap'])
+        self.assertEqual(
+            new_snaptable['snapserver']['snaps'], old_snaptable['snapserver']['snaps'])
diff --git a/qa/tasks/cephfs/test_dump_tree.py b/qa/tasks/cephfs/test_dump_tree.py
new file mode 100644
index 000000000..48a2c6f00
--- /dev/null
+++ b/qa/tasks/cephfs/test_dump_tree.py
@@ -0,0 +1,66 @@
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import random
+import os
+
+class TestDumpTree(CephFSTestCase):
+    def get_paths_to_ino(self):
+        inos = {}
+        p = self.mount_a.run_shell(["find", "./"])
+        paths = p.stdout.getvalue().strip().split()
+        for path in paths:
+            inos[path] = self.mount_a.path_to_ino(path, False)
+
+        return inos
+
+    def populate(self):
+        self.mount_a.run_shell(["git", "clone",
+                                "https://github.com/ceph/ceph-qa-suite"])
+
+    def test_basic(self):
+        self.mount_a.run_shell(["mkdir", "parent"])
+        self.mount_a.run_shell(["mkdir", "parent/child"])
+        self.mount_a.run_shell(["touch", "parent/child/file"])
+        self.mount_a.run_shell(["mkdir", "parent/child/grandchild"])
+        self.mount_a.run_shell(["touch", "parent/child/grandchild/file"])
+
+        inos = self.get_paths_to_ino()
+        tree = self.fs.mds_asok(["dump", "tree", "/parent/child", "1"])
+
+        target_inos = [inos["./parent/child"], inos["./parent/child/file"],
+                       inos["./parent/child/grandchild"]]
+
+        for ino in tree:
+            del target_inos[target_inos.index(ino['ino'])] # don't catch!
+            
+        assert(len(target_inos) == 0)
+
+    def test_random(self):
+        random.seed(0)
+
+        self.populate()
+        inos = self.get_paths_to_ino()
+        target = random.sample(inos.keys(), 1)[0]
+
+        if target != "./":
+            target = os.path.dirname(target)
+
+        subtree = [path for path in inos.keys() if path.startswith(target)]
+        target_inos = [inos[path] for path in subtree]
+        tree = self.fs.mds_asok(["dump", "tree", target[1:]])
+
+        for ino in tree:
+            del target_inos[target_inos.index(ino['ino'])] # don't catch!
+            
+        assert(len(target_inos) == 0)
+
+        target_depth = target.count('/')
+        maxdepth = max([path.count('/') for path in subtree]) - target_depth
+        depth = random.randint(0, maxdepth)
+        target_inos = [inos[path] for path in subtree \
+                       if path.count('/') <= depth + target_depth]
+        tree = self.fs.mds_asok(["dump", "tree", target[1:], str(depth)])
+
+        for ino in tree:
+            del target_inos[target_inos.index(ino['ino'])] # don't catch!
+            
+        assert(len(target_inos) == 0)
diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py
new file mode 100644
index 000000000..d2421bedc
--- /dev/null
+++ b/qa/tasks/cephfs/test_exports.py
@@ -0,0 +1,519 @@
+import logging
+import random
+import time
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra.run import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+class TestExports(CephFSTestCase):
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
+
+    def test_session_race(self):
+        """
+        Test session creation race.
+
+        See: https://tracker.ceph.com/issues/24072#change-113056
+        """
+
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        rank1 = self.fs.get_rank(rank=1, status=status)
+
+        # Create a directory that is pre-exported to rank 1
+        self.mount_a.run_shell(["mkdir", "-p", "a/aa"])
+        self.mount_a.setfattr("a", "ceph.dir.pin", "1")
+        self._wait_subtrees([('/a', 1)], status=status, rank=1)
+
+        # Now set the mds config to allow the race
+        self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "true"], rank=1)
+
+        # Now create another directory and try to export it
+        self.mount_b.run_shell(["mkdir", "-p", "b/bb"])
+        self.mount_b.setfattr("b", "ceph.dir.pin", "1")
+
+        time.sleep(5)
+
+        # Now turn off the race so that it doesn't wait again
+        self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "false"], rank=1)
+
+        # Now try to create a session with rank 1 by accessing a dir known to
+        # be there, if buggy, this should cause the rank 1 to crash:
+        self.mount_b.run_shell(["ls", "a"])
+
+        # Check if rank1 changed (standby tookover?)
+        new_rank1 = self.fs.get_rank(rank=1)
+        self.assertEqual(rank1['gid'], new_rank1['gid'])
+
+class TestExportPin(CephFSTestCase):
+    MDSS_REQUIRED = 3
+    CLIENTS_REQUIRED = 1
+
+    def setUp(self):
+        CephFSTestCase.setUp(self)
+
+        self.fs.set_max_mds(3)
+        self.status = self.fs.wait_for_daemons()
+
+        self.mount_a.run_shell_payload("mkdir -p 1/2/3/4")
+
+    def test_noop(self):
+        self.mount_a.setfattr("1", "ceph.dir.pin", "-1")
+        time.sleep(30) # for something to not happen
+        self._wait_subtrees([], status=self.status)
+
+    def test_negative(self):
+        self.mount_a.setfattr("1", "ceph.dir.pin", "-2341")
+        time.sleep(30) # for something to not happen
+        self._wait_subtrees([], status=self.status)
+
+    def test_empty_pin(self):
+        self.mount_a.setfattr("1/2/3/4", "ceph.dir.pin", "1")
+        time.sleep(30) # for something to not happen
+        self._wait_subtrees([], status=self.status)
+
+    def test_trivial(self):
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self._wait_subtrees([('/1', 1)], status=self.status, rank=1)
+
+    def test_export_targets(self):
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self._wait_subtrees([('/1', 1)], status=self.status, rank=1)
+        self.status = self.fs.status()
+        r0 = self.status.get_rank(self.fs.id, 0)
+        self.assertTrue(sorted(r0['export_targets']) == [1])
+
+    def test_redundant(self):
+        # redundant pin /1/2 to rank 1
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self._wait_subtrees([('/1', 1)], status=self.status, rank=1)
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "1")
+        self._wait_subtrees([('/1', 1), ('/1/2', 1)], status=self.status, rank=1)
+
+    def test_reassignment(self):
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "1")
+        self._wait_subtrees([('/1/2', 1)], status=self.status, rank=1)
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "0")
+        self._wait_subtrees([('/1/2', 0)], status=self.status, rank=0)
+
+    def test_phantom_rank(self):
+        self.mount_a.setfattr("1", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "10")
+        time.sleep(30) # wait for nothing weird to happen
+        self._wait_subtrees([('/1', 0)], status=self.status)
+
+    def test_nested(self):
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("1/2/3", "ceph.dir.pin", "2")
+        self._wait_subtrees([('/1', 1), ('/1/2', 0), ('/1/2/3', 2)], status=self.status, rank=2)
+
+    def test_nested_unset(self):
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "2")
+        self._wait_subtrees([('/1', 1), ('/1/2', 2)], status=self.status, rank=1)
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "-1")
+        self._wait_subtrees([('/1', 1)], status=self.status, rank=1)
+
+    def test_rename(self):
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self.mount_a.run_shell_payload("mkdir -p 9/8/7")
+        self.mount_a.setfattr("9/8", "ceph.dir.pin", "0")
+        self._wait_subtrees([('/1', 1), ("/9/8", 0)], status=self.status, rank=0)
+        self.mount_a.run_shell_payload("mv 9/8 1/2")
+        self._wait_subtrees([('/1', 1), ("/1/2/8", 0)], status=self.status, rank=0)
+
+    def test_getfattr(self):
+        # pin /1 to rank 0
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "0")
+        self._wait_subtrees([('/1', 1), ('/1/2', 0)], status=self.status, rank=1)
+
+        if not isinstance(self.mount_a, FuseMount):
+            p = self.mount_a.client_remote.sh('uname -r', wait=True)
+            dir_pin = self.mount_a.getfattr("1", "ceph.dir.pin")
+            log.debug("mount.getfattr('1','ceph.dir.pin'): %s " % dir_pin)
+            if str(p) < "5" and not(dir_pin):
+                self.skipTest("Kernel does not support getting the extended attribute ceph.dir.pin")
+        self.assertEqual(self.mount_a.getfattr("1", "ceph.dir.pin"), '1')
+        self.assertEqual(self.mount_a.getfattr("1/2", "ceph.dir.pin"), '0')
+
+    def test_export_pin_cache_drop(self):
+        """
+        That the export pin does not prevent empty (nothing in cache) subtree merging.
+        """
+
+        self.mount_a.setfattr("1", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "1")
+        self._wait_subtrees([('/1', 0), ('/1/2', 1)], status=self.status)
+        self.mount_a.umount_wait() # release all caps
+        def _drop():
+            self.fs.ranks_tell(["cache", "drop"], status=self.status)
+        # drop cache multiple times to clear replica pins
+        self._wait_subtrees([], status=self.status, action=_drop)
+
+class TestEphemeralPins(CephFSTestCase):
+    MDSS_REQUIRED = 3
+    CLIENTS_REQUIRED = 1
+
+    def setUp(self):
+        CephFSTestCase.setUp(self)
+
+        self.config_set('mds', 'mds_export_ephemeral_random', True)
+        self.config_set('mds', 'mds_export_ephemeral_distributed', True)
+        self.config_set('mds', 'mds_export_ephemeral_random_max', 1.0)
+
+        self.mount_a.run_shell_payload("""
+set -e
+
+# Use up a random number of inode numbers so the ephemeral pinning is not the same every test.
+mkdir .inode_number_thrash
+count=$((RANDOM % 1024))
+for ((i = 0; i < count; i++)); do touch .inode_number_thrash/$i; done
+rm -rf .inode_number_thrash
+""")
+
+        self.fs.set_max_mds(3)
+        self.status = self.fs.wait_for_daemons()
+
+    def _setup_tree(self, path="tree", export=-1, distributed=False, random=0.0, count=100, wait=True):
+        return self.mount_a.run_shell_payload(f"""
+set -ex
+mkdir -p {path}
+{f"setfattr -n ceph.dir.pin -v {export} {path}" if export >= 0 else ""}
+{f"setfattr -n ceph.dir.pin.distributed -v 1 {path}" if distributed else ""}
+{f"setfattr -n ceph.dir.pin.random -v {random} {path}" if random > 0.0 else ""}
+for ((i = 0; i < {count}; i++)); do
+    mkdir -p "{path}/$i"
+    echo file > "{path}/$i/file"
+done
+""", wait=wait)
+
+    def test_ephemeral_pin_dist_override(self):
+        """
+        That an ephemeral distributed pin overrides a normal export pin.
+        """
+
+        self._setup_tree(distributed=True)
+        subtrees = self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all")
+        for s in subtrees:
+            path = s['dir']['path']
+            if path == '/tree':
+                self.assertTrue(s['distributed_ephemeral_pin'])
+
+    def test_ephemeral_pin_dist_override_pin(self):
+        """
+        That an export pin overrides an ephemerally pinned directory.
+        """
+
+        self._setup_tree(distributed=True)
+        subtrees = self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all")
+        self.mount_a.setfattr("tree", "ceph.dir.pin", "0")
+        time.sleep(15)
+        subtrees = self._get_subtrees(status=self.status, rank=0)
+        for s in subtrees:
+            path = s['dir']['path']
+            if path == '/tree':
+                self.assertEqual(s['auth_first'], 0)
+                self.assertFalse(s['distributed_ephemeral_pin'])
+        # it has been merged into /tree
+
+    def test_ephemeral_pin_dist_off(self):
+        """
+        That turning off ephemeral distributed pin merges subtrees.
+        """
+
+        self._setup_tree(distributed=True)
+        self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all")
+        self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "0")
+        time.sleep(15)
+        subtrees = self._get_subtrees(status=self.status, rank=0)
+        for s in subtrees:
+            path = s['dir']['path']
+            if path == '/tree':
+                self.assertFalse(s['distributed_ephemeral_pin'])
+
+
+    def test_ephemeral_pin_dist_conf_off(self):
+        """
+        That turning off ephemeral distributed pin config prevents distribution.
+        """
+
+        self._setup_tree()
+        self.config_set('mds', 'mds_export_ephemeral_distributed', False)
+        self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "1")
+        time.sleep(15)
+        subtrees = self._get_subtrees(status=self.status, rank=0)
+        for s in subtrees:
+            path = s['dir']['path']
+            if path == '/tree':
+                self.assertFalse(s['distributed_ephemeral_pin'])
+
+    def _test_ephemeral_pin_dist_conf_off_merge(self):
+        """
+        That turning off ephemeral distributed pin config merges subtrees.
+        FIXME: who triggers the merge?
+        """
+
+        self._setup_tree(distributed=True)
+        self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all")
+        self.config_set('mds', 'mds_export_ephemeral_distributed', False)
+        self._wait_subtrees([('/tree', 0)], timeout=60, status=self.status)
+
+    def test_ephemeral_pin_dist_override_before(self):
+        """
+        That a conventional export pin overrides the distributed policy _before_ distributed policy is set.
+        """
+
+        count = 10
+        self._setup_tree(count=count)
+        test = []
+        for i in range(count):
+            path = f"tree/{i}"
+            self.mount_a.setfattr(path, "ceph.dir.pin", "1")
+            test.append(("/"+path, 1))
+        self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "1")
+        time.sleep(15) # for something to not happen...
+        self._wait_subtrees(test, timeout=60, status=self.status, rank="all", path="/tree/")
+
+    def test_ephemeral_pin_dist_override_after(self):
+        """
+        That a conventional export pin overrides the distributed policy _after_ distributed policy is set.
+        """
+
+        self._setup_tree(distributed=True)
+        self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all")
+        test = []
+        for i in range(10):
+            path = f"tree/{i}"
+            self.mount_a.setfattr(path, "ceph.dir.pin", "1")
+            test.append(("/"+path, 1))
+        self._wait_subtrees(test, timeout=60, status=self.status, rank="all", path="/tree/")
+
+    def test_ephemeral_pin_dist_failover(self):
+        """
+        That MDS failover does not cause unnecessary migrations.
+        """
+
+        # pin /tree so it does not export during failover
+        self._setup_tree(distributed=True)
+        self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all")
+        #test = [(s['dir']['path'], s['auth_first']) for s in subtrees]
+        before = self.fs.ranks_perf(lambda p: p['mds']['exported'])
+        log.info(f"export stats: {before}")
+        self.fs.rank_fail(rank=1)
+        self.status = self.fs.wait_for_daemons()
+        time.sleep(10) # waiting for something to not happen
+        after = self.fs.ranks_perf(lambda p: p['mds']['exported'])
+        log.info(f"export stats: {after}")
+        self.assertEqual(before, after)
+
+    def test_ephemeral_pin_distribution(self):
+        """
+        That ephemerally pinned subtrees are somewhat evenly distributed.
+        """
+
+        max_mds = 3
+        frags = 128
+
+        self.fs.set_max_mds(max_mds)
+        self.status = self.fs.wait_for_daemons()
+
+        self.config_set('mds', 'mds_export_ephemeral_distributed_factor', (frags-1) / max_mds)
+        self._setup_tree(count=1000, distributed=True)
+
+        subtrees = self._wait_distributed_subtrees(frags, status=self.status, rank="all")
+        nsubtrees = len(subtrees)
+
+        # Check if distribution is uniform
+        rank0 = list(filter(lambda x: x['auth_first'] == 0, subtrees))
+        rank1 = list(filter(lambda x: x['auth_first'] == 1, subtrees))
+        rank2 = list(filter(lambda x: x['auth_first'] == 2, subtrees))
+        self.assertGreaterEqual(len(rank0)/nsubtrees, 0.15)
+        self.assertGreaterEqual(len(rank1)/nsubtrees, 0.15)
+        self.assertGreaterEqual(len(rank2)/nsubtrees, 0.15)
+
+
+    def test_ephemeral_random(self):
+        """
+        That 100% randomness causes all children to be pinned.
+        """
+        self._setup_tree(random=1.0)
+        self._wait_random_subtrees(100, status=self.status, rank="all")
+
+    def test_ephemeral_random_max(self):
+        """
+        That the config mds_export_ephemeral_random_max is not exceeded.
+        """
+
+        r = 0.5
+        count = 1000
+        self._setup_tree(count=count, random=r)
+        subtrees = self._wait_random_subtrees(int(r*count*.75), status=self.status, rank="all")
+        self.config_set('mds', 'mds_export_ephemeral_random_max', 0.01)
+        self._setup_tree(path="tree/new", count=count)
+        time.sleep(30) # for something not to happen...
+        subtrees = self._get_subtrees(status=self.status, rank="all", path="tree/new/")
+        self.assertLessEqual(len(subtrees), int(.01*count*1.25))
+
+    def test_ephemeral_random_max_config(self):
+        """
+        That the config mds_export_ephemeral_random_max config rejects new OOB policies.
+        """
+
+        self.config_set('mds', 'mds_export_ephemeral_random_max', 0.01)
+        try:
+            p = self._setup_tree(count=1, random=0.02, wait=False)
+            p.wait()
+        except CommandFailedError as e:
+            log.info(f"{e}")
+            self.assertIn("Invalid", p.stderr.getvalue())
+        else:
+            raise RuntimeError("mds_export_ephemeral_random_max ignored!")
+
+    def test_ephemeral_random_dist(self):
+        """
+        That ephemeral distributed pin overrides ephemeral random pin
+        """
+
+        self._setup_tree(random=1.0, distributed=True)
+        self._wait_distributed_subtrees(3 * 2, status=self.status)
+
+        time.sleep(15)
+        subtrees = self._get_subtrees(status=self.status, rank=0)
+        for s in subtrees:
+            path = s['dir']['path']
+            if path.startswith('/tree'):
+                self.assertFalse(s['random_ephemeral_pin'])
+
+    def test_ephemeral_random_pin_override_before(self):
+        """
+        That a conventional export pin overrides the random policy before creating new directories.
+        """
+
+        self._setup_tree(count=0, random=1.0)
+        self._setup_tree(path="tree/pin", count=10, export=1)
+        self._wait_subtrees([("/tree/pin", 1)], status=self.status, rank=1, path="/tree/pin")
+
+    def test_ephemeral_random_pin_override_after(self):
+        """
+        That a conventional export pin overrides the random policy after creating new directories.
+        """
+
+        count = 10
+        self._setup_tree(count=0, random=1.0)
+        self._setup_tree(path="tree/pin", count=count)
+        self._wait_random_subtrees(count+1, status=self.status, rank="all")
+        self.mount_a.setfattr("tree/pin", "ceph.dir.pin", "1")
+        self._wait_subtrees([("/tree/pin", 1)], status=self.status, rank=1, path="/tree/pin")
+
+    def test_ephemeral_randomness(self):
+        """
+        That the randomness is reasonable.
+        """
+
+        r = random.uniform(0.25, 0.75) # ratios don't work for small r!
+        count = 1000
+        self._setup_tree(count=count, random=r)
+        subtrees = self._wait_random_subtrees(int(r*count*.50), status=self.status, rank="all")
+        time.sleep(30) # for max to not be exceeded
+        subtrees = self._wait_random_subtrees(int(r*count*.50), status=self.status, rank="all")
+        self.assertLessEqual(len(subtrees), int(r*count*1.50))
+
+    def test_ephemeral_random_cache_drop(self):
+        """
+        That the random ephemeral pin does not prevent empty (nothing in cache) subtree merging.
+        """
+
+        count = 100
+        self._setup_tree(count=count, random=1.0)
+        self._wait_random_subtrees(count, status=self.status, rank="all")
+        self.mount_a.umount_wait() # release all caps
+        def _drop():
+            self.fs.ranks_tell(["cache", "drop"], status=self.status)
+        self._wait_subtrees([], status=self.status, action=_drop)
+
+    def test_ephemeral_random_failover(self):
+        """
+        That the random ephemeral pins stay pinned across MDS failover.
+        """
+
+        count = 100
+        r = 0.5
+        self._setup_tree(count=count, random=r)
+        # wait for all random subtrees to be created, not a specific count
+        time.sleep(30)
+        subtrees = self._wait_random_subtrees(1, status=self.status, rank=1)
+        before = [(s['dir']['path'], s['auth_first']) for s in subtrees]
+        before.sort();
+
+        self.fs.rank_fail(rank=1)
+        self.status = self.fs.wait_for_daemons()
+
+        time.sleep(30) # waiting for something to not happen
+        subtrees = self._wait_random_subtrees(1, status=self.status, rank=1)
+        after = [(s['dir']['path'], s['auth_first']) for s in subtrees]
+        after.sort();
+        log.info(f"subtrees before: {before}")
+        log.info(f"subtrees after: {after}")
+
+        self.assertEqual(before, after)
+
+    def test_ephemeral_pin_grow_mds(self):
+        """
+        That consistent hashing works to reduce the number of migrations.
+        """
+
+        self.fs.set_max_mds(2)
+        self.status = self.fs.wait_for_daemons()
+
+        self._setup_tree(random=1.0)
+        subtrees_old = self._wait_random_subtrees(100, status=self.status, rank="all")
+
+        self.fs.set_max_mds(3)
+        self.status = self.fs.wait_for_daemons()
+        
+        # Sleeping for a while to allow the ephemeral pin migrations to complete
+        time.sleep(30)
+        
+        subtrees_new = self._wait_random_subtrees(100, status=self.status, rank="all")
+        count = 0
+        for old_subtree in subtrees_old:
+            for new_subtree in subtrees_new:
+                if (old_subtree['dir']['path'] == new_subtree['dir']['path']) and (old_subtree['auth_first'] != new_subtree['auth_first']):
+                    count = count + 1
+                    break
+
+        log.info("{0} migrations have occured due to the cluster resizing".format(count))
+        # ~50% of subtrees from the two rank will migrate to another rank
+        self.assertLessEqual((count/len(subtrees_old)), (0.5)*1.25) # with 25% overbudget
+
+    def test_ephemeral_pin_shrink_mds(self):
+        """
+        That consistent hashing works to reduce the number of migrations.
+        """
+
+        self.fs.set_max_mds(3)
+        self.status = self.fs.wait_for_daemons()
+
+        self._setup_tree(random=1.0)
+        subtrees_old = self._wait_random_subtrees(100, status=self.status, rank="all")
+
+        self.fs.set_max_mds(2)
+        self.status = self.fs.wait_for_daemons()
+        time.sleep(30)
+
+        subtrees_new = self._wait_random_subtrees(100, status=self.status, rank="all")
+        count = 0
+        for old_subtree in subtrees_old:
+            for new_subtree in subtrees_new:
+                if (old_subtree['dir']['path'] == new_subtree['dir']['path']) and (old_subtree['auth_first'] != new_subtree['auth_first']):
+                    count = count + 1
+                    break
+
+        log.info("{0} migrations have occured due to the cluster resizing".format(count))
+        # rebalancing from 3 -> 2 may cause half of rank 0/1 to move and all of rank 2
+        self.assertLessEqual((count/len(subtrees_old)), (1.0/3.0/2.0 + 1.0/3.0/2.0 + 1.0/3.0)*1.25) # aka .66 with 25% overbudget
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
new file mode 100644
index 000000000..745489309
--- /dev/null
+++ b/qa/tasks/cephfs/test_failover.py
@@ -0,0 +1,800 @@
+import time
+import signal
+import logging
+import operator
+from random import randint
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.fuse_mount import FuseMount
+
+log = logging.getLogger(__name__)
+
+class TestClusterAffinity(CephFSTestCase):
+    CLIENTS_REQUIRED = 0
+    MDSS_REQUIRED = 4
+
+    def _verify_join_fs(self, target, status=None):
+        if status is None:
+            status = self.fs.wait_for_daemons(timeout=30)
+            log.debug("%s", status)
+        target = sorted(target, key=operator.itemgetter('name'))
+        log.info("target = %s", target)
+        current = list(status.get_all())
+        current = sorted(current, key=operator.itemgetter('name'))
+        log.info("current = %s", current)
+        self.assertEqual(len(current), len(target))
+        for i in range(len(current)):
+            for attr in target[i]:
+                self.assertIn(attr, current[i])
+                self.assertEqual(target[i][attr], current[i][attr])
+
+    def _change_target_state(self, state, name, changes):
+        for entity in state:
+            if entity['name'] == name:
+                for k, v in changes.items():
+                    entity[k] = v
+                return
+        self.fail("no entity")
+
+    def _verify_init(self):
+        status = self.fs.status()
+        log.info("status = {0}".format(status))
+        target = [{'join_fscid': -1, 'name': info['name']} for info in status.get_all()]
+        self._verify_join_fs(target, status=status)
+        return (status, target)
+
+    def _reach_target(self, target):
+        def takeover():
+            try:
+                self._verify_join_fs(target)
+                return True
+            except AssertionError as e:
+                log.debug("%s", e)
+                return False
+        self.wait_until_true(takeover, 30)
+
+    def test_join_fs_runtime(self):
+        """
+        That setting mds_join_fs at runtime affects the cluster layout.
+        """
+        status, target = self._verify_init()
+        standbys = list(status.get_standbys())
+        self.config_set('mds.'+standbys[0]['name'], 'mds_join_fs', 'cephfs')
+        self._change_target_state(target, standbys[0]['name'], {'join_fscid': self.fs.id, 'state': 'up:active'})
+        self._reach_target(target)
+
+    def test_join_fs_unset(self):
+        """
+        That unsetting mds_join_fs will cause failover if another high-affinity standby exists.
+        """
+        status, target = self._verify_init()
+        standbys = list(status.get_standbys())
+        names = (standbys[0]['name'], standbys[1]['name'])
+        self.config_set('mds.'+names[0], 'mds_join_fs', 'cephfs')
+        self.config_set('mds.'+names[1], 'mds_join_fs', 'cephfs')
+        self._change_target_state(target, names[0], {'join_fscid': self.fs.id})
+        self._change_target_state(target, names[1], {'join_fscid': self.fs.id})
+        self._reach_target(target)
+        status = self.fs.status()
+        active = self.fs.get_active_names(status=status)[0]
+        self.assertIn(active, names)
+        self.config_rm('mds.'+active, 'mds_join_fs')
+        self._change_target_state(target, active, {'join_fscid': -1})
+        new_active = (set(names) - set((active,))).pop()
+        self._change_target_state(target, new_active, {'state': 'up:active'})
+        self._reach_target(target)
+
+    def test_join_fs_drop(self):
+        """
+        That unsetting mds_join_fs will not cause failover if no high-affinity standby exists.
+        """
+        status, target = self._verify_init()
+        standbys = list(status.get_standbys())
+        active = standbys[0]['name']
+        self.config_set('mds.'+active, 'mds_join_fs', 'cephfs')
+        self._change_target_state(target, active, {'join_fscid': self.fs.id, 'state': 'up:active'})
+        self._reach_target(target)
+        self.config_rm('mds.'+active, 'mds_join_fs')
+        self._change_target_state(target, active, {'join_fscid': -1})
+        self._reach_target(target)
+
+    def test_join_fs_vanilla(self):
+        """
+        That a vanilla standby is preferred over others with mds_join_fs set to another fs.
+        """
+        # After Octopus is EOL, we can remove this setting:
+        self.fs.set_allow_multifs()
+        fs2 = self.mds_cluster.newfs(name="cephfs2")
+        status, target = self._verify_init()
+        active = self.fs.get_active_names(status=status)[0]
+        standbys = [info['name'] for info in status.get_standbys()]
+        victim = standbys.pop()
+        # Set a bogus fs on the others
+        for mds in standbys:
+            self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2')
+            self._change_target_state(target, mds, {'join_fscid': fs2.id})
+        self.fs.rank_fail()
+        self._change_target_state(target, victim, {'state': 'up:active'})
+        self._reach_target(target)
+        status = self.fs.status()
+        active = self.fs.get_active_names(status=status)[0]
+        self.assertEqual(active, victim)
+
+    def test_join_fs_last_resort(self):
+        """
+        That a standby with mds_join_fs set to another fs is still used if necessary.
+        """
+        status, target = self._verify_init()
+        standbys = [info['name'] for info in status.get_standbys()]
+        for mds in standbys:
+            self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2')
+        # After Octopus is EOL, we can remove this setting:
+        self.fs.set_allow_multifs()
+        fs2 = self.mds_cluster.newfs(name="cephfs2")
+        for mds in standbys:
+            self._change_target_state(target, mds, {'join_fscid': fs2.id})
+        self.fs.rank_fail()
+        status = self.fs.status()
+        ranks = list(self.fs.get_ranks(status=status))
+        self.assertEqual(len(ranks), 1)
+        self.assertIn(ranks[0]['name'], standbys)
+        # Note that we would expect the former active to reclaim its spot, but
+        # we're not testing that here.
+
+    def test_join_fs_steady(self):
+        """
+        That a sole MDS with mds_join_fs set will come back as active eventually even after failover.
+        """
+        status, target = self._verify_init()
+        active = self.fs.get_active_names(status=status)[0]
+        self.config_set('mds.'+active, 'mds_join_fs', 'cephfs')
+        self._change_target_state(target, active, {'join_fscid': self.fs.id})
+        self._reach_target(target)
+        self.fs.rank_fail()
+        self._reach_target(target)
+
+    def test_join_fs_standby_replay(self):
+        """
+        That a standby-replay daemon with weak affinity is replaced by a stronger one.
+        """
+        status, target = self._verify_init()
+        standbys = [info['name'] for info in status.get_standbys()]
+        self.config_set('mds.'+standbys[0], 'mds_join_fs', 'cephfs')
+        self._change_target_state(target, standbys[0], {'join_fscid': self.fs.id, 'state': 'up:active'})
+        self._reach_target(target)
+        self.fs.set_allow_standby_replay(True)
+        status = self.fs.status()
+        standbys = [info['name'] for info in status.get_standbys()]
+        self.config_set('mds.'+standbys[0], 'mds_join_fs', 'cephfs')
+        self._change_target_state(target, standbys[0], {'join_fscid': self.fs.id, 'state': 'up:standby-replay'})
+        self._reach_target(target)
+
+class TestClusterResize(CephFSTestCase):
+    CLIENTS_REQUIRED = 0
+    MDSS_REQUIRED = 3
+
+    def test_grow(self):
+        """
+        That the MDS cluster grows after increasing max_mds.
+        """
+
+        # Need all my standbys up as well as the active daemons
+        # self.wait_for_daemon_start() necessary?
+
+        self.fs.grow(2)
+        self.fs.grow(3)
+
+
+    def test_shrink(self):
+        """
+        That the MDS cluster shrinks automatically after decreasing max_mds.
+        """
+
+        self.fs.grow(3)
+        self.fs.shrink(1)
+
+    def test_up_less_than_max(self):
+        """
+        That a health warning is generated when max_mds is greater than active count.
+        """
+
+        status = self.fs.status()
+        mdss = [info['gid'] for info in status.get_all()]
+        self.fs.set_max_mds(len(mdss)+1)
+        self.wait_for_health("MDS_UP_LESS_THAN_MAX", 30)
+        self.fs.shrink(2)
+        self.wait_for_health_clear(30)
+
+    def test_down_health(self):
+        """
+        That marking a FS down does not generate a health warning
+        """
+
+        self.fs.set_down()
+        try:
+            self.wait_for_health("", 30)
+            raise RuntimeError("got health warning?")
+        except RuntimeError as e:
+            if "Timed out after" in str(e):
+                pass
+            else:
+                raise
+
+    def test_down_twice(self):
+        """
+        That marking a FS down twice does not wipe old_max_mds.
+        """
+
+        self.fs.grow(2)
+        self.fs.set_down()
+        self.fs.wait_for_daemons()
+        self.fs.set_down(False)
+        self.assertEqual(self.fs.get_var("max_mds"), 2)
+        self.fs.wait_for_daemons(timeout=60)
+
+    def test_down_grow(self):
+        """
+        That setting max_mds undoes down.
+        """
+
+        self.fs.set_down()
+        self.fs.wait_for_daemons()
+        self.fs.grow(2)
+        self.fs.wait_for_daemons()
+
+    def test_down(self):
+        """
+        That down setting toggles and sets max_mds appropriately.
+        """
+
+        self.fs.set_down()
+        self.fs.wait_for_daemons()
+        self.assertEqual(self.fs.get_var("max_mds"), 0)
+        self.fs.set_down(False)
+        self.assertEqual(self.fs.get_var("max_mds"), 1)
+        self.fs.wait_for_daemons()
+        self.assertEqual(self.fs.get_var("max_mds"), 1)
+
+    def test_hole(self):
+        """
+        Test that a hole cannot be created in the FS ranks.
+        """
+
+        fscid = self.fs.id
+
+        self.fs.grow(2)
+
+        # Now add a delay which should slow down how quickly rank 1 stops
+        self.config_set('mds', 'ms_inject_delay_max', '5.0')
+        self.config_set('mds', 'ms_inject_delay_probability', '1.0')
+        self.fs.set_max_mds(1)
+        log.info("status = {0}".format(self.fs.status()))
+
+        # Don't wait for rank 1 to stop
+        self.fs.set_max_mds(3)
+        log.info("status = {0}".format(self.fs.status()))
+
+        # Now check that the mons didn't try to promote a standby to rank 2
+        self.fs.set_max_mds(2)
+        status = self.fs.status()
+        try:
+            status = self.fs.wait_for_daemons(timeout=90)
+            ranks = set([info['rank'] for info in status.get_ranks(fscid)])
+            self.assertEqual(ranks, set([0, 1]))
+        finally:
+            log.info("status = {0}".format(status))
+
+    def test_thrash(self):
+        """
+        Test that thrashing max_mds does not fail.
+        """
+
+        max_mds = 2
+        for i in range(0, 100):
+            self.fs.set_max_mds(max_mds)
+            max_mds = (max_mds+1)%3+1
+
+        self.fs.wait_for_daemons(timeout=90)
+
+class TestFailover(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 2
+
+    def test_simple(self):
+        """
+        That when the active MDS is killed, a standby MDS is promoted into
+        its rank after the grace period.
+
+        This is just a simple unit test, the harder cases are covered
+        in thrashing tests.
+        """
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+
+        (original_active, ) = self.fs.get_active_names()
+        original_standbys = self.mds_cluster.get_standby_daemons()
+
+        # Kill the rank 0 daemon's physical process
+        self.fs.mds_stop(original_active)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        # Wait until the monitor promotes his replacement
+        def promoted():
+            active = self.fs.get_active_names()
+            return active and active[0] in original_standbys
+
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
+        self.wait_until_true(
+            promoted,
+            timeout=grace*2)
+
+        # Start the original rank 0 daemon up again, see that he becomes a standby
+        self.fs.mds_restart(original_active)
+        self.wait_until_true(
+            lambda: original_active in self.mds_cluster.get_standby_daemons(),
+            timeout=60  # Approximately long enough for MDS to start and mon to notice
+        )
+
+    def test_client_abort(self):
+        """
+        That a client will respect fuse_require_active_mds and error out
+        when the cluster appears to be unavailable.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Requires FUSE client to inject client metadata")
+
+        require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true"
+        if not require_active:
+            self.skipTest("fuse_require_active_mds is not set")
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        # Check it's not laggy to begin with
+        (original_active, ) = self.fs.get_active_names()
+        self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
+
+        self.mounts[0].umount_wait()
+
+        # Control: that we can mount and unmount usually, while the cluster is healthy
+        self.mounts[0].mount_wait()
+        self.mounts[0].umount_wait()
+
+        # Stop the daemon processes
+        self.fs.mds_stop()
+
+        # Wait for everyone to go laggy
+        def laggy():
+            mdsmap = self.fs.get_mds_map()
+            for info in mdsmap['info'].values():
+                if "laggy_since" not in info:
+                    return False
+
+            return True
+
+        self.wait_until_true(laggy, grace * 2)
+        with self.assertRaises(CommandFailedError):
+            self.mounts[0].mount_wait()
+
+    def test_standby_count_wanted(self):
+        """
+        That cluster health warnings are generated by insufficient standbys available.
+        """
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        standbys = self.mds_cluster.get_standby_daemons()
+        self.assertGreaterEqual(len(standbys), 1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
+
+        # Kill a standby and check for warning
+        victim = standbys.pop()
+        self.fs.mds_stop(victim)
+        log.info("waiting for insufficient standby daemon warning")
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+
+        # restart the standby, see that he becomes a standby, check health clears
+        self.fs.mds_restart(victim)
+        self.wait_until_true(
+            lambda: victim in self.mds_cluster.get_standby_daemons(),
+            timeout=60  # Approximately long enough for MDS to start and mon to notice
+        )
+        self.wait_for_health_clear(timeout=30)
+
+        # Set it one greater than standbys ever seen
+        standbys = self.mds_cluster.get_standby_daemons()
+        self.assertGreaterEqual(len(standbys), 1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
+        log.info("waiting for insufficient standby daemon warning")
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+
+        # Set it to 0
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
+        self.wait_for_health_clear(timeout=30)
+
+    def test_discontinuous_mdsmap(self):
+        """
+        That discontinuous mdsmap does not affect failover.
+        See http://tracker.ceph.com/issues/24856.
+        """
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.umount_wait()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+        monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
+
+        mds_0 = self.fs.get_rank(rank=0, status=status)
+        self.fs.rank_freeze(True, rank=0) # prevent failover
+        self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
+        self.wait_until_true(
+            lambda: "laggy_since" in self.fs.get_rank(),
+            timeout=grace * 2
+        )
+
+        self.fs.rank_fail(rank=1)
+        self.fs.wait_for_state('up:resolve', rank=1, timeout=30)
+
+        # Make sure of mds_0's monitor connection gets reset
+        time.sleep(monc_timeout * 2)
+
+        # Continue rank 0, it will get discontinuous mdsmap
+        self.fs.rank_signal(signal.SIGCONT, rank=0)
+        self.wait_until_true(
+            lambda: "laggy_since" not in self.fs.get_rank(rank=0),
+            timeout=grace * 2
+        )
+
+        # mds.b will be stuck at 'reconnect' state if snapserver gets confused
+        # by discontinuous mdsmap
+        self.fs.wait_for_state('up:active', rank=1, timeout=30)
+        self.assertEqual(mds_0['gid'], self.fs.get_rank(rank=0)['gid'])
+        self.fs.rank_freeze(False, rank=0)
+
+    def test_connect_bootstrapping(self):
+        self.config_set("mds", "mds_sleep_rank_change", 10000000.0)
+        self.config_set("mds", "mds_connect_bootstrapping", True)
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+        self.fs.rank_fail(rank=0)
+        # rank 0 will get stuck in up:resolve, see https://tracker.ceph.com/issues/53194
+        self.fs.wait_for_daemons()
+
+
+class TestStandbyReplay(CephFSTestCase):
+    CLIENTS_REQUIRED = 0
+    MDSS_REQUIRED = 4
+
+    def _confirm_no_replay(self):
+        status = self.fs.status()
+        _ = len(list(status.get_standbys()))
+        self.assertEqual(0, len(list(self.fs.get_replays(status=status))))
+        return status
+
+    def _confirm_single_replay(self, full=True, status=None, retries=3):
+        status = self.fs.wait_for_daemons(status=status)
+        ranks = sorted(self.fs.get_mds_map(status=status)['in'])
+        replays = list(self.fs.get_replays(status=status))
+        checked_replays = set()
+        for rank in ranks:
+            has_replay = False
+            for replay in replays:
+                if replay['rank'] == rank:
+                    self.assertFalse(has_replay)
+                    has_replay = True
+                    checked_replays.add(replay['gid'])
+            if full and not has_replay:
+                if retries <= 0:
+                    raise RuntimeError("rank "+str(rank)+" has no standby-replay follower")
+                else:
+                    retries = retries-1
+                    time.sleep(2)
+        self.assertEqual(checked_replays, set(info['gid'] for info in replays))
+        return status
+
+    def _check_replay_takeover(self, status, rank=0):
+        replay = self.fs.get_replay(rank=rank, status=status)
+        new_status = self.fs.wait_for_daemons()
+        new_active = self.fs.get_rank(rank=rank, status=new_status)
+        if replay:
+            self.assertEqual(replay['gid'], new_active['gid'])
+        else:
+            # double check takeover came from a standby (or some new daemon via restart)
+            found = False
+            for info in status.get_standbys():
+                if info['gid'] == new_active['gid']:
+                    found = True
+                    break
+            if not found:
+                for info in status.get_all():
+                    self.assertNotEqual(info['gid'], new_active['gid'])
+        return new_status
+
+    def test_standby_replay_singleton(self):
+        """
+        That only one MDS becomes standby-replay.
+        """
+
+        self._confirm_no_replay()
+        self.fs.set_allow_standby_replay(True)
+        time.sleep(30)
+        self._confirm_single_replay()
+
+    def test_standby_replay_damaged(self):
+        """
+        That a standby-replay daemon can cause the rank to go damaged correctly.
+        """
+
+        self._confirm_no_replay()
+        self.config_set("mds", "mds_standby_replay_damaged", True)
+        self.fs.set_allow_standby_replay(True)
+        self.wait_until_true(
+            lambda: len(self.fs.get_damaged()) > 0,
+            timeout=30
+        )
+        status = self.fs.status()
+        self.assertListEqual([], list(self.fs.get_ranks(status=status)))
+        self.assertListEqual([0], self.fs.get_damaged(status=status))
+
+    def test_standby_replay_disable(self):
+        """
+        That turning off allow_standby_replay fails all standby-replay daemons.
+        """
+
+        self._confirm_no_replay()
+        self.fs.set_allow_standby_replay(True)
+        time.sleep(30)
+        self._confirm_single_replay()
+        self.fs.set_allow_standby_replay(False)
+        self._confirm_no_replay()
+
+    def test_standby_replay_singleton_fail(self):
+        """
+        That failures don't violate singleton constraint.
+        """
+
+        self._confirm_no_replay()
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay()
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            self.fs.rank_restart(status=status)
+            status = self._check_replay_takeover(status)
+            status = self._confirm_single_replay(status=status)
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            self.fs.rank_fail()
+            status = self._check_replay_takeover(status)
+            status = self._confirm_single_replay(status=status)
+
+    def test_standby_replay_singleton_fail_multimds(self):
+        """
+        That failures don't violate singleton constraint with multiple actives.
+        """
+
+        status = self._confirm_no_replay()
+        new_max_mds = randint(2, len(list(status.get_standbys())))
+        self.fs.set_max_mds(new_max_mds)
+        self.fs.wait_for_daemons() # wait for actives to come online!
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay(full=False)
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            victim = randint(0, new_max_mds-1)
+            self.fs.rank_restart(rank=victim, status=status)
+            status = self._check_replay_takeover(status, rank=victim)
+            status = self._confirm_single_replay(status=status, full=False)
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            victim = randint(0, new_max_mds-1)
+            self.fs.rank_fail(rank=victim)
+            status = self._check_replay_takeover(status, rank=victim)
+            status = self._confirm_single_replay(status=status, full=False)
+
+    def test_standby_replay_failure(self):
+        """
+        That the failure of a standby-replay daemon happens cleanly
+        and doesn't interrupt anything else.
+        """
+
+        status = self._confirm_no_replay()
+        self.fs.set_max_mds(1)
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay()
+
+        for i in range(10):
+            time.sleep(randint(1, 5))
+            victim = self.fs.get_replay(status=status)
+            self.fs.mds_restart(mds_id=victim['name'])
+            status = self._confirm_single_replay(status=status)
+
+    def test_standby_replay_prepare_beacon(self):
+        """
+        That a MDSMonitor::prepare_beacon handles standby-replay daemons
+        correctly without removing the standby. (Note, usually a standby-replay
+        beacon will just be replied to by MDSMonitor::preprocess_beacon.)
+        """
+
+        status = self._confirm_no_replay()
+        self.fs.set_max_mds(1)
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay()
+        replays = list(status.get_replays(self.fs.id))
+        self.assertEqual(len(replays), 1)
+        self.config_set('mds.'+replays[0]['name'], 'mds_inject_health_dummy', True)
+        time.sleep(10) # for something not to happen...
+        status = self._confirm_single_replay()
+        replays2 = list(status.get_replays(self.fs.id))
+        self.assertEqual(replays[0]['gid'], replays2[0]['gid'])
+
+    def test_rank_stopped(self):
+        """
+        That when a rank is STOPPED, standby replays for
+        that rank get torn down
+        """
+
+        status = self._confirm_no_replay()
+        standby_count = len(list(status.get_standbys()))
+        self.fs.set_max_mds(2)
+        self.fs.set_allow_standby_replay(True)
+        status = self._confirm_single_replay()
+
+        self.fs.set_max_mds(1) # stop rank 1
+
+        status = self._confirm_single_replay()
+        self.assertTrue(standby_count, len(list(status.get_standbys())))
+
+
+class TestMultiFilesystems(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 4
+
+    # We'll create our own filesystems and start our own daemons
+    REQUIRE_FILESYSTEM = False
+
+    def setUp(self):
+        super(TestMultiFilesystems, self).setUp()
+        self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
+            "enable_multiple", "true",
+            "--yes-i-really-mean-it")
+
+    def _setup_two(self):
+        fs_a = self.mds_cluster.newfs(name="alpha")
+        fs_b = self.mds_cluster.newfs(name="bravo")
+
+        self.mds_cluster.mds_restart()
+
+        # Wait for both filesystems to go healthy
+        fs_a.wait_for_daemons()
+        fs_b.wait_for_daemons()
+
+        # Reconfigure client auth caps
+        for mount in self.mounts:
+            self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+                'auth', 'caps', "client.{0}".format(mount.client_id),
+                'mds', 'allow',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    fs_a.get_data_pool_name(), fs_b.get_data_pool_name()))
+
+        return fs_a, fs_b
+
+    def test_clients(self):
+        fs_a, fs_b = self._setup_two()
+
+        # Mount a client on fs_a
+        self.mount_a.mount_wait(cephfs_name=fs_a.name)
+        self.mount_a.write_n_mb("pad.bin", 1)
+        self.mount_a.write_n_mb("test.bin", 2)
+        a_created_ino = self.mount_a.path_to_ino("test.bin")
+        self.mount_a.create_files()
+
+        # Mount a client on fs_b
+        self.mount_b.mount_wait(cephfs_name=fs_b.name)
+        self.mount_b.write_n_mb("test.bin", 1)
+        b_created_ino = self.mount_b.path_to_ino("test.bin")
+        self.mount_b.create_files()
+
+        # Check that a non-default filesystem mount survives an MDS
+        # failover (i.e. that map subscription is continuous, not
+        # just the first time), reproduces #16022
+        old_fs_b_mds = fs_b.get_active_names()[0]
+        self.mds_cluster.mds_stop(old_fs_b_mds)
+        self.mds_cluster.mds_fail(old_fs_b_mds)
+        fs_b.wait_for_daemons()
+        background = self.mount_b.write_background()
+        # Raise exception if the write doesn't finish (i.e. if client
+        # has not kept up with MDS failure)
+        try:
+            self.wait_until_true(lambda: background.finished, timeout=30)
+        except RuntimeError:
+            # The mount is stuck, we'll have to force it to fail cleanly
+            background.stdin.close()
+            self.mount_b.umount_wait(force=True)
+            raise
+
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # See that the client's files went into the correct pool
+        self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024))
+        self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024))
+
+    def test_standby(self):
+        fs_a, fs_b = self._setup_two()
+
+        # Assert that the remaining two MDS daemons are now standbys
+        a_daemons = fs_a.get_active_names()
+        b_daemons = fs_b.get_active_names()
+        self.assertEqual(len(a_daemons), 1)
+        self.assertEqual(len(b_daemons), 1)
+        original_a = a_daemons[0]
+        original_b = b_daemons[0]
+        expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons))
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+        self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons())
+
+        # Kill fs_a's active MDS, see a standby take over
+        self.mds_cluster.mds_stop(original_a)
+        self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 1)
+        # Assert that it's a *different* daemon that has now appeared in the map for fs_a
+        self.assertNotEqual(fs_a.get_active_names()[0], original_a)
+
+        # Kill fs_b's active MDS, see a standby take over
+        self.mds_cluster.mds_stop(original_b)
+        self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 1)
+        # Assert that it's a *different* daemon that has now appeared in the map for fs_a
+        self.assertNotEqual(fs_b.get_active_names()[0], original_b)
+
+        # Both of the original active daemons should be gone, and all standbys used up
+        self.assertEqual(self.mds_cluster.get_standby_daemons(), set())
+
+        # Restart the ones I killed, see them reappear as standbys
+        self.mds_cluster.mds_restart(original_a)
+        self.mds_cluster.mds_restart(original_b)
+        self.wait_until_true(
+            lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(),
+            timeout=30
+        )
+
+    def test_grow_shrink(self):
+        # Usual setup...
+        fs_a, fs_b = self._setup_two()
+
+        # Increase max_mds on fs_b, see a standby take up the role
+        fs_b.set_max_mds(2)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Increase max_mds on fs_a, see a standby take up the role
+        fs_a.set_max_mds(2)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Shrink fs_b back to 1, see a daemon go back to standby
+        fs_b.set_max_mds(1)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Grow fs_a up to 3, see the former fs_b daemon join it.
+        fs_a.set_max_mds(3)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60,
+                              reject_fn=lambda v: v > 3 or v < 2)
diff --git a/qa/tasks/cephfs/test_flush.py b/qa/tasks/cephfs/test_flush.py
new file mode 100644
index 000000000..17cb84970
--- /dev/null
+++ b/qa/tasks/cephfs/test_flush.py
@@ -0,0 +1,112 @@
+
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+
+
+class TestFlush(CephFSTestCase):
+    def test_flush(self):
+        self.mount_a.run_shell(["mkdir", "mydir"])
+        self.mount_a.run_shell(["touch", "mydir/alpha"])
+        dir_ino = self.mount_a.path_to_ino("mydir")
+        file_ino = self.mount_a.path_to_ino("mydir/alpha")
+
+        # Unmount the client so that it isn't still holding caps
+        self.mount_a.umount_wait()
+
+        # Before flush, the dirfrag object does not exist
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(dir_ino)
+
+        # Before flush, the file's backtrace has not been written
+        with self.assertRaises(ObjectNotFound):
+            self.fs.read_backtrace(file_ino)
+
+        # Before flush, there are no dentries in the root
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+        # Execute flush
+        flush_data = self.fs.mds_asok(["flush", "journal"])
+        self.assertEqual(flush_data['return_code'], 0)
+
+        # After flush, the dirfrag object has been created
+        dir_list = self.fs.list_dirfrag(dir_ino)
+        self.assertEqual(dir_list, ["alpha_head"])
+
+        # And the 'mydir' dentry is in the root
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), ['mydir_head'])
+
+        # ...and the data object has its backtrace
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'mydir'], [a['dname'] for a in backtrace['ancestors']])
+        self.assertEqual([dir_ino, 1], [a['dirino'] for a in backtrace['ancestors']])
+        self.assertEqual(file_ino, backtrace['ino'])
+
+        # ...and the journal is truncated to just a single subtreemap from the
+        # newly created segment
+        summary_output = self.fs.journal_tool(["event", "get", "summary"], 0)
+        try:
+            self.assertEqual(summary_output,
+                             dedent(
+                                 """
+                                 Events by type:
+                                   SUBTREEMAP: 1
+                                 Errors: 0
+                                 """
+                             ).strip())
+        except AssertionError:
+            # In some states, flushing the journal will leave you
+            # an extra event from locks a client held.   This is
+            # correct behaviour: the MDS is flushing the journal,
+            # it's just that new events are getting added too.
+            # In this case, we should nevertheless see a fully
+            # empty journal after a second flush.
+            self.assertEqual(summary_output,
+                             dedent(
+                                 """
+                                 Events by type:
+                                   SUBTREEMAP: 1
+                                   UPDATE: 1
+                                 Errors: 0
+                                 """
+                             ).strip())
+            flush_data = self.fs.mds_asok(["flush", "journal"])
+            self.assertEqual(flush_data['return_code'], 0)
+            self.assertEqual(self.fs.journal_tool(["event", "get", "summary"], 0),
+                             dedent(
+                                 """
+                                 Events by type:
+                                   SUBTREEMAP: 1
+                                 Errors: 0
+                                 """
+                             ).strip())
+
+        # Now for deletion!
+        # We will count the RADOS deletions and MDS file purges, to verify that
+        # the expected behaviour is happening as a result of the purge
+        initial_dels = self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete']
+        initial_purges = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued']
+
+        # Use a client to delete a file
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell(["rm", "-rf", "mydir"])
+
+        # Flush the journal so that the directory inode can be purged
+        flush_data = self.fs.mds_asok(["flush", "journal"])
+        self.assertEqual(flush_data['return_code'], 0)
+
+        # We expect to see a single file purge
+        self.wait_until_true(
+            lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued'] - initial_purges >= 2,
+            60)
+
+        # We expect two deletions, one of the dirfrag and one of the backtrace
+        self.wait_until_true(
+            lambda: self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] - initial_dels >= 2,
+            60)  # timeout is fairly long to allow for tick+rados latencies
+
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(dir_ino)
+        with self.assertRaises(ObjectNotFound):
+            self.fs.read_backtrace(file_ino)
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
diff --git a/qa/tasks/cephfs/test_forward_scrub.py b/qa/tasks/cephfs/test_forward_scrub.py
new file mode 100644
index 000000000..82630e069
--- /dev/null
+++ b/qa/tasks/cephfs/test_forward_scrub.py
@@ -0,0 +1,300 @@
+
+"""
+Test that the forward scrub functionality can traverse metadata and apply
+requested tags, on well formed metadata.
+
+This is *not* the real testing for forward scrub, which will need to test
+how the functionality responds to damaged metadata.
+
+"""
+import logging
+import json
+
+from collections import namedtuple
+from io import BytesIO
+from textwrap import dedent
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+import struct
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class TestForwardScrub(CephFSTestCase):
+    MDSS_REQUIRED = 1
+
+    def _read_str_xattr(self, pool, obj, attr):
+        """
+        Read a ceph-encoded string from a rados xattr
+        """
+        output = self.fs.mon_manager.do_rados(["getxattr", obj, attr], pool=pool,
+                               stdout=BytesIO()).stdout.getvalue()
+        strlen = struct.unpack('i', output[0:4])[0]
+        return output[4:(4 + strlen)].decode(encoding='ascii')
+
+    def _get_paths_to_ino(self):
+        inos = {}
+        p = self.mount_a.run_shell(["find", "./"])
+        paths = p.stdout.getvalue().strip().split()
+        for path in paths:
+            inos[path] = self.mount_a.path_to_ino(path)
+
+        return inos
+
+    def test_apply_tag(self):
+        self.mount_a.run_shell(["mkdir", "parentdir"])
+        self.mount_a.run_shell(["mkdir", "parentdir/childdir"])
+        self.mount_a.run_shell(["touch", "rfile"])
+        self.mount_a.run_shell(["touch", "parentdir/pfile"])
+        self.mount_a.run_shell(["touch", "parentdir/childdir/cfile"])
+
+        # Build a structure mapping path to inode, as we will later want
+        # to check object by object and objects are named after ino number
+        inos = self._get_paths_to_ino()
+
+        # Flush metadata: this is a friendly test of forward scrub so we're skipping
+        # the part where it's meant to cope with dirty metadata
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        tag = "mytag"
+
+        # Execute tagging forward scrub
+        self.fs.mds_asok(["tag", "path", "/parentdir", tag])
+        # Wait for completion
+        import time
+        time.sleep(10)
+        # FIXME watching clog isn't a nice mechanism for this, once we have a ScrubMap we'll
+        # watch that instead
+
+        # Check that dirs were tagged
+        for dirpath in ["./parentdir", "./parentdir/childdir"]:
+            self.assertTagged(inos[dirpath], tag, self.fs.get_metadata_pool_name())
+
+        # Check that files were tagged
+        for filepath in ["./parentdir/pfile", "./parentdir/childdir/cfile"]:
+            self.assertTagged(inos[filepath], tag, self.fs.get_data_pool_name())
+
+        # This guy wasn't in the tag path, shouldn't have been tagged
+        self.assertUntagged(inos["./rfile"])
+
+    def assertUntagged(self, ino):
+        file_obj_name = "{0:x}.00000000".format(ino)
+        with self.assertRaises(CommandFailedError):
+            self._read_str_xattr(
+                self.fs.get_data_pool_name(),
+                file_obj_name,
+                "scrub_tag"
+            )
+
+    def assertTagged(self, ino, tag, pool):
+        file_obj_name = "{0:x}.00000000".format(ino)
+        wrote = self._read_str_xattr(
+            pool,
+            file_obj_name,
+            "scrub_tag"
+        )
+        self.assertEqual(wrote, tag)
+
+    def _validate_linkage(self, expected):
+        inos = self._get_paths_to_ino()
+        try:
+            self.assertDictEqual(inos, expected)
+        except AssertionError:
+            log.error("Expected: {0}".format(json.dumps(expected, indent=2)))
+            log.error("Actual: {0}".format(json.dumps(inos, indent=2)))
+            raise
+
+    def test_orphan_scan(self):
+        # Create some files whose metadata we will flush
+        self.mount_a.run_python(dedent("""
+            import os
+            mount_point = "{mount_point}"
+            parent = os.path.join(mount_point, "parent")
+            os.mkdir(parent)
+            flushed = os.path.join(parent, "flushed")
+            os.mkdir(flushed)
+            for f in ["alpha", "bravo", "charlie"]:
+                open(os.path.join(flushed, f), 'w').write(f)
+        """.format(mount_point=self.mount_a.mountpoint)))
+
+        inos = self._get_paths_to_ino()
+
+        # Flush journal
+        # Umount before flush to avoid cap releases putting
+        # things we don't want in the journal later.
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Create a new inode that's just in the log, i.e. would
+        # look orphaned to backward scan if backward scan wisnae
+        # respectin' tha scrub_tag xattr.
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell(["mkdir", "parent/unflushed"])
+        self.mount_a.run_shell(["dd", "if=/dev/urandom",
+                                "of=./parent/unflushed/jfile",
+                                "bs=1M", "count=8"])
+        inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed")
+        inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile")
+        self.mount_a.umount_wait()
+
+        # Orphan an inode by deleting its dentry
+        # Our victim will be.... bravo.
+        self.mount_a.umount_wait()
+        self.fs.fail()
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+        frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
+        self.fs.radosm(["rmomapkey", frag_obj_id, "bravo_head"])
+
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+
+        # See that the orphaned file is indeed missing from a client's POV
+        self.mount_a.mount_wait()
+        damaged_state = self._get_paths_to_ino()
+        self.assertNotIn("./parent/flushed/bravo", damaged_state)
+        self.mount_a.umount_wait()
+
+        # Run a tagging forward scrub
+        tag = "mytag123"
+        self.fs.mds_asok(["tag", "path", "/parent", tag])
+
+        # See that the orphan wisnae tagged
+        self.assertUntagged(inos['./parent/flushed/bravo'])
+
+        # See that the flushed-metadata-and-still-present files are tagged
+        self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name())
+        self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name())
+
+        # See that journalled-but-not-flushed file *was* tagged
+        self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
+
+        # Run cephfs-data-scan targeting only orphans
+        self.fs.fail()
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
+        self.fs.data_scan([
+            "scan_inodes",
+            "--filter-tag", tag,
+            self.fs.get_data_pool_name()
+        ])
+
+        # After in-place injection stats should be kosher again
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', True)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True)
+
+        # And we should have all the same linkage we started with,
+        # and no lost+found, and no extra inodes!
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount_wait()
+        self._validate_linkage(inos)
+
+    def _stash_inotable(self):
+        # Get all active ranks
+        ranks = self.fs.get_all_mds_rank()
+
+        inotable_dict = {}
+        for rank in ranks:
+            inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable"
+            print("Trying to fetch inotable object: " + inotable_oid)
+
+            #self.fs.get_metadata_object("InoTable", "mds0_inotable")
+            inotable_raw = self.fs.radosmo(['get', inotable_oid, '-'])
+            inotable_dict[inotable_oid] = inotable_raw
+        return inotable_dict
+
+    def test_inotable_sync(self):
+        self.mount_a.write_n_mb("file1_sixmegs", 6)
+
+        # Flush journal
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        inotable_copy = self._stash_inotable()
+
+        self.mount_a.mount_wait()
+
+        self.mount_a.write_n_mb("file2_sixmegs", 6)
+        self.mount_a.write_n_mb("file3_sixmegs", 6)
+
+        inos = self._get_paths_to_ino()
+
+        # Flush journal
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        self.mount_a.umount_wait()
+
+        with self.assert_cluster_log("inode table repaired", invert_match=True):
+            out_json = self.fs.run_scrub(["start", "/", "repair,recursive"])
+            self.assertNotEqual(out_json, None)
+            self.assertEqual(out_json["return_code"], 0)
+            self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
+
+        self.fs.fail()
+
+        # Truncate the journal (to ensure the inotable on disk
+        # is all that will be in the InoTable in memory)
+
+        self.fs.journal_tool(["event", "splice",
+                              "--inode={0}".format(inos["./file2_sixmegs"]), "summary"], 0)
+
+        self.fs.journal_tool(["event", "splice",
+                              "--inode={0}".format(inos["./file3_sixmegs"]), "summary"], 0)
+
+        # Revert to old inotable.
+        for key, value in inotable_copy.items():
+            self.fs.radosm(["put", key, "-"], stdin=BytesIO(value))
+
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+
+        with self.assert_cluster_log("inode table repaired"):
+            out_json = self.fs.run_scrub(["start", "/", "repair,recursive"])
+            self.assertNotEqual(out_json, None)
+            self.assertEqual(out_json["return_code"], 0)
+            self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
+
+        self.fs.fail()
+        table_text = self.fs.table_tool(["0", "show", "inode"])
+        table = json.loads(table_text)
+        self.assertGreater(
+                table['0']['data']['inotable']['free'][0]['start'],
+                inos['./file3_sixmegs'])
+
+    def test_backtrace_repair(self):
+        """
+        That the MDS can repair an inodes backtrace in the data pool
+        if it is found to be damaged.
+        """
+        # Create a file for subsequent checks
+        self.mount_a.run_shell(["mkdir", "parent_a"])
+        self.mount_a.run_shell(["touch", "parent_a/alpha"])
+        file_ino = self.mount_a.path_to_ino("parent_a/alpha")
+
+        # That backtrace and layout are written after initial flush
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_a'],
+                         [a['dname'] for a in backtrace['ancestors']])
+
+        # Go corrupt the backtrace
+        self.fs._write_data_xattr(file_ino, "parent",
+                                  "oh i'm sorry did i overwrite your xattr?")
+
+        with self.assert_cluster_log("bad backtrace on inode"):
+            out_json = self.fs.run_scrub(["start", "/", "repair,recursive"])
+            self.assertNotEqual(out_json, None)
+            self.assertEqual(out_json["return_code"], 0)
+            self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
+
+        self.fs.mds_asok(["flush", "journal"])
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_a'],
+                         [a['dname'] for a in backtrace['ancestors']])
diff --git a/qa/tasks/cephfs/test_fragment.py b/qa/tasks/cephfs/test_fragment.py
new file mode 100644
index 000000000..1102f887b
--- /dev/null
+++ b/qa/tasks/cephfs/test_fragment.py
@@ -0,0 +1,319 @@
+from io import StringIO
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra import run
+
+import os
+import time
+import logging
+log = logging.getLogger(__name__)
+
+
+class TestFragmentation(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def get_splits(self):
+        return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split']
+
+    def get_merges(self):
+        return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge']
+
+    def get_dir_ino(self, path):
+        dir_cache = self.fs.read_cache(path, 0)
+        dir_ino = None
+        dir_inono = self.mount_a.path_to_ino(path.strip("/"))
+        for ino in dir_cache:
+            if ino['ino'] == dir_inono:
+                dir_ino = ino
+                break
+        self.assertIsNotNone(dir_ino)
+        return dir_ino
+
+    def _configure(self, **kwargs):
+        """
+        Apply kwargs as MDS configuration settings, enable dirfrags
+        and restart the MDSs.
+        """
+
+        for k, v in kwargs.items():
+            self.ceph_cluster.set_ceph_conf("mds", k, v.__str__())
+
+        self.mds_cluster.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+    def test_oversize(self):
+        """
+        That a directory is split when it becomes too large.
+        """
+
+        split_size = 20
+        merge_size = 5
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=1
+        )
+
+        self.assertEqual(self.get_splits(), 0)
+
+        self.mount_a.create_n_files("splitdir/file", split_size + 1)
+
+        self.wait_until_true(
+            lambda: self.get_splits() == 1,
+            timeout=30
+        )
+
+        frags = self.get_dir_ino("/splitdir")['dirfrags']
+        self.assertEqual(len(frags), 2)
+        self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*")
+        self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*")
+        self.assertEqual(
+            sum([len(f['dentries']) for f in frags]),
+            split_size + 1
+        )
+
+        self.assertEqual(self.get_merges(), 0)
+
+        self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])
+
+        self.wait_until_true(
+            lambda: self.get_merges() == 1,
+            timeout=30
+        )
+
+        self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1)
+
+    def test_rapid_creation(self):
+        """
+        That the fast-splitting limit of 1.5x normal limit is
+        applied when creating dentries quickly.
+        """
+
+        split_size = 100
+        merge_size = 1
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=3,
+            mds_bal_fragment_size_max=int(split_size * 1.5 + 2)
+        )
+
+        # We test this only at a single split level.  If a client was sending
+        # IO so fast that it hit a second split before the first split
+        # was complete, it could violate mds_bal_fragment_size_max -- there
+        # is a window where the child dirfrags of a split are unfrozen
+        # (so they can grow), but still have STATE_FRAGMENTING (so they
+        # can't be split).
+
+        # By writing 4x the split size when the split bits are set
+        # to 3 (i.e. 4-ways), I am reasonably sure to see precisely
+        # one split.  The test is to check whether that split
+        # happens soon enough that the client doesn't exceed
+        # 2x the split_size (the "immediate" split mode should
+        # kick in at 1.5x the split size).
+
+        self.assertEqual(self.get_splits(), 0)
+        self.mount_a.create_n_files("splitdir/file", split_size * 4)
+        self.wait_until_equal(
+            self.get_splits,
+            1,
+            reject_fn=lambda s: s > 1,
+            timeout=30
+        )
+
+    def test_deep_split(self):
+        """
+        That when the directory grows many times larger than split size,
+        the fragments get split again.
+        """
+
+        split_size = 100
+        merge_size = 1  # i.e. don't merge frag unless its empty
+        split_bits = 1
+
+        branch_factor = 2**split_bits
+
+        # Arbitrary: how many levels shall we try fragmenting before
+        # ending the test?
+        max_depth = 5
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=split_bits
+        )
+
+        # Each iteration we will create another level of fragments.  The
+        # placement of dentries into fragments is by hashes (i.e. pseudo
+        # random), so we rely on statistics to get the behaviour that
+        # by writing about 1.5x as many dentries as the split_size times
+        # the number of frags, we will get them all to exceed their
+        # split size and trigger a split.
+        depth = 0
+        files_written = 0
+        splits_expected = 0
+        while depth < max_depth:
+            log.info("Writing files for depth {0}".format(depth))
+            target_files = branch_factor**depth * int(split_size * 1.5)
+            create_files = target_files - files_written
+
+            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+                "{0} Writing {1} files (depth={2})".format(
+                    self.__class__.__name__, create_files, depth
+                ))
+            self.mount_a.create_n_files("splitdir/file_{0}".format(depth),
+                                        create_files)
+            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+                "{0} Done".format(self.__class__.__name__))
+
+            files_written += create_files
+            log.info("Now have {0} files".format(files_written))
+
+            splits_expected += branch_factor**depth
+            log.info("Waiting to see {0} splits".format(splits_expected))
+            try:
+                self.wait_until_equal(
+                    self.get_splits,
+                    splits_expected,
+                    timeout=30,
+                    reject_fn=lambda x: x > splits_expected
+                )
+
+                frags = self.get_dir_ino("/splitdir")['dirfrags']
+                self.assertEqual(len(frags), branch_factor**(depth+1))
+                self.assertEqual(
+                    sum([len(f['dentries']) for f in frags]),
+                    target_files
+                )
+            except:
+                # On failures, log what fragmentation we actually ended
+                # up with.  This block is just for logging, at the end
+                # we raise the exception again.
+                frags = self.get_dir_ino("/splitdir")['dirfrags']
+                log.info("depth={0} splits_expected={1} files_written={2}".format(
+                    depth, splits_expected, files_written
+                ))
+                log.info("Dirfrags:")
+                for f in frags:
+                    log.info("{0}: {1}".format(
+                        f['dirfrag'], len(f['dentries'])
+                    ))
+                raise
+
+            depth += 1
+
+        # Remember the inode number because we will be checking for
+        # objects later.
+        dir_inode_no = self.mount_a.path_to_ino("splitdir")
+
+        self.mount_a.run_shell(["rm", "-rf", "splitdir/"])
+        self.mount_a.umount_wait()
+
+        self.fs.mds_asok(['flush', 'journal'])
+
+        def _check_pq_finished():
+            num_strays = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['num_strays']
+            pq_ops = self.fs.mds_asok(['perf', 'dump', 'purge_queue'])['purge_queue']['pq_executing']
+            return num_strays == 0 and pq_ops == 0
+
+        # Wait for all strays to purge
+        self.wait_until_true(
+            lambda: _check_pq_finished(),
+            timeout=1200
+        )
+        # Check that the metadata pool objects for all the myriad
+        # child fragments are gone
+        metadata_objs = self.fs.radosmo(["ls"], stdout=StringIO()).strip()
+        frag_objs = []
+        for o in metadata_objs.split("\n"):
+            if o.startswith("{0:x}.".format(dir_inode_no)):
+                frag_objs.append(o)
+        self.assertListEqual(frag_objs, [])
+
+    def test_split_straydir(self):
+        """
+        That stray dir is split when it becomes too large.
+        """
+        def _count_fragmented():
+            mdsdir_cache = self.fs.read_cache("~mdsdir", 1)
+            num = 0
+            for ino in mdsdir_cache:
+                if ino["ino"] == 0x100:
+                    continue
+                if len(ino["dirfrags"]) > 1:
+                    log.info("straydir 0x{:X} is fragmented".format(ino["ino"]))
+                    num += 1;
+            return num
+
+        split_size = 50
+        merge_size = 5
+        split_bits = 1
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=split_bits,
+            mds_bal_fragment_size_max=(split_size * 100)
+        )
+
+        # manually split/merge
+        self.assertEqual(_count_fragmented(), 0)
+        self.fs.mds_asok(["dirfrag", "split", "~mdsdir/stray8", "0/0", "1"])
+        self.fs.mds_asok(["dirfrag", "split", "~mdsdir/stray9", "0/0", "1"])
+        self.wait_until_true(
+            lambda: _count_fragmented() == 2,
+            timeout=30
+        )
+
+        time.sleep(30)
+
+        self.fs.mds_asok(["dirfrag", "merge", "~mdsdir/stray8", "0/0"])
+        self.wait_until_true(
+            lambda: _count_fragmented() == 1,
+            timeout=30
+        )
+
+        time.sleep(30)
+
+        # auto merge
+
+        # merging stray dirs is driven by MDCache::advance_stray()
+        # advance stray dir 10 times
+        for _ in range(10):
+            self.fs.mds_asok(['flush', 'journal'])
+
+        self.wait_until_true(
+            lambda: _count_fragmented() == 0,
+            timeout=30
+        )
+
+        # auto split
+
+        # there are 10 stray dirs. advance stray dir 20 times
+        self.mount_a.create_n_files("testdir1/file", split_size * 20)
+        self.mount_a.run_shell(["mkdir", "testdir2"])
+        testdir1_path = os.path.join(self.mount_a.mountpoint, "testdir1")
+        for i in self.mount_a.ls(testdir1_path):
+            self.mount_a.run_shell(["ln", "testdir1/{0}".format(i), "testdir2/"])
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount_wait()
+        self.mount_a.wait_until_mounted()
+
+        # flush journal and restart mds. after restart, testdir2 is not in mds' cache
+        self.fs.mds_asok(['flush', 'journal'])
+        self.mds_cluster.mds_fail_restart()
+        self.fs.wait_for_daemons()
+        # splitting stray dirs is driven by MDCache::advance_stray()
+        # advance stray dir after unlink 'split_size' files.
+        self.fs.mds_asok(['config', 'set', 'mds_log_events_per_segment', str(split_size)])
+
+        self.assertEqual(_count_fragmented(), 0)
+        self.mount_a.run_shell(["rm", "-rf", "testdir1"])
+        self.wait_until_true(
+            lambda: _count_fragmented() > 0,
+            timeout=30
+        )
diff --git a/qa/tasks/cephfs/test_fstop.py b/qa/tasks/cephfs/test_fstop.py
new file mode 100644
index 000000000..08617807e
--- /dev/null
+++ b/qa/tasks/cephfs/test_fstop.py
@@ -0,0 +1,27 @@
+import logging
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+class TestFSTop(CephFSTestCase):
+    def test_fstop_non_existent_cluster(self):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", "stats")
+        try:
+            self.mount_a.run_shell(['cephfs-top',
+                                    '--cluster=hpec',
+                                    '--id=admin',
+                                    '--selftest'])
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError('expected cephfs-top command to fail.')
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", "stats")
+
+    def test_fstop(self):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", "stats")
+        self.mount_a.run_shell(['cephfs-top',
+                                '--id=admin',
+                                '--selftest'])
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", "stats")
diff --git a/qa/tasks/cephfs/test_full.py b/qa/tasks/cephfs/test_full.py
new file mode 100644
index 000000000..8282b4e0c
--- /dev/null
+++ b/qa/tasks/cephfs/test_full.py
@@ -0,0 +1,398 @@
+import json
+import logging
+import os
+from textwrap import dedent
+from typing import Optional
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class FullnessTestCase(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+
+    # Subclasses define whether they're filling whole cluster or just data pool
+    data_only = False
+
+    # Subclasses define how many bytes should be written to achieve fullness
+    pool_capacity: Optional[int] = None
+    fill_mb = None
+
+    def is_full(self):
+        return self.fs.is_full()
+
+    def setUp(self):
+        CephFSTestCase.setUp(self)
+
+        mds_status = self.fs.rank_asok(["status"])
+
+        # Capture the initial OSD map epoch for later use
+        self.initial_osd_epoch = mds_status['osdmap_epoch_barrier']
+
+    def test_barrier(self):
+        """
+        That when an OSD epoch barrier is set on an MDS, subsequently
+        issued capabilities cause clients to update their OSD map to that
+        epoch.
+        """
+
+        # script that sync up client with MDS OSD map barrier. The barrier should
+        # be updated by cap flush ack message.
+        pyscript = dedent("""
+            import os
+            fd = os.open("{path}", os.O_CREAT | os.O_RDWR, 0O600)
+            os.fchmod(fd, 0O666)
+            os.fsync(fd)
+            os.close(fd)
+            """)
+
+        # Sync up client with initial MDS OSD map barrier.
+        path = os.path.join(self.mount_a.mountpoint, "foo")
+        self.mount_a.run_python(pyscript.format(path=path))
+
+        # Grab mounts' initial OSD epochs: later we will check that
+        # it hasn't advanced beyond this point.
+        mount_a_initial_epoch, mount_a_initial_barrier = self.mount_a.get_osd_epoch()
+
+        # Freshly mounted at start of test, should be up to date with OSD map
+        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)
+
+        # Set and unset a flag to cause OSD epoch to increment
+        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
+        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")
+
+        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
+        new_epoch = json.loads(out)['epoch']
+        self.assertNotEqual(self.initial_osd_epoch, new_epoch)
+
+        # Do a metadata operation on clients, witness that they end up with
+        # the old OSD map from startup time (nothing has prompted client
+        # to update its map)
+        path = os.path.join(self.mount_a.mountpoint, "foo")
+        self.mount_a.run_python(pyscript.format(path=path))
+        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
+        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
+        self.assertEqual(mount_a_barrier, mount_a_initial_barrier)
+
+        # Set a barrier on the MDS
+        self.fs.rank_asok(["osdmap", "barrier", new_epoch.__str__()])
+
+        # Sync up client with new MDS OSD map barrier
+        path = os.path.join(self.mount_a.mountpoint, "baz")
+        self.mount_a.run_python(pyscript.format(path=path))
+        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
+        self.assertEqual(mount_a_barrier, new_epoch)
+
+        # Some time passes here because the metadata part of the operation
+        # completes immediately, while the resulting OSD map update happens
+        # asynchronously (it's an Objecter::_maybe_request_map) as a result
+        # of seeing the new epoch barrier.
+        self.wait_until_true(
+            lambda: self.mount_a.get_osd_epoch()[0] >= new_epoch,
+            timeout=30)
+
+    def _data_pool_name(self):
+        data_pool_names = self.fs.get_data_pool_names()
+        if len(data_pool_names) > 1:
+            raise RuntimeError("This test can't handle multiple data pools")
+        else:
+            return data_pool_names[0]
+
+    def _test_full(self, easy_case):
+        """
+        - That a client trying to write data to a file is prevented
+        from doing so with an -EFULL result
+        - That they are also prevented from creating new files by the MDS.
+        - That they may delete another file to get the system healthy again
+
+        :param easy_case: if true, delete a successfully written file to
+                          free up space.  else, delete the file that experienced
+                          the failed write.
+        """
+
+        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
+
+        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))
+
+        # Fill up the cluster.  This dd may or may not fail, as it depends on
+        # how soon the cluster recognises its own fullness
+        self.mount_a.write_n_mb("large_file_a", self.fill_mb // 2)
+        try:
+            self.mount_a.write_n_mb("large_file_b", (self.fill_mb * 1.1) // 2)
+        except CommandFailedError:
+            log.info("Writing file B failed (full status happened already)")
+            assert self.is_full()
+        else:
+            log.info("Writing file B succeeded (full status will happen soon)")
+            self.wait_until_true(lambda: self.is_full(),
+                                 timeout=osd_mon_report_interval * 120)
+
+        # Attempting to write more data should give me ENOSPC
+        with self.assertRaises(CommandFailedError) as ar:
+            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb // 2)
+        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"
+
+        # Wait for the MDS to see the latest OSD map so that it will reliably
+        # be applying the policy of rejecting non-deletion metadata operations
+        # while in the full state.
+        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
+        self.wait_until_true(
+            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
+            timeout=10)
+
+        if not self.data_only:
+            with self.assertRaises(CommandFailedError):
+                self.mount_a.write_n_mb("small_file_1", 0)
+
+        # Clear out some space
+        if easy_case:
+            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
+            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
+        else:
+            # In the hard case it is the file that filled the system.
+            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
+            # would fail because the last objects written would be
+            # stuck in the client cache as objecter operations.
+            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
+            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
+
+        # Here we are waiting for two things to happen:
+        # * The MDS to purge the stray folder and execute object deletions
+        #  * The OSDs to inform the mon that they are no longer full
+        self.wait_until_true(lambda: not self.is_full(),
+                             timeout=osd_mon_report_interval * 120)
+
+        # Wait for the MDS to see the latest OSD map so that it will reliably
+        # be applying the free space policy
+        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
+        self.wait_until_true(
+            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
+            timeout=10)
+
+        # Now I should be able to write again
+        self.mount_a.write_n_mb("large_file", 50, seek=0)
+
+        # Ensure that the MDS keeps its OSD epoch barrier across a restart
+
+    def test_full_different_file(self):
+        self._test_full(True)
+
+    def test_full_same_file(self):
+        self._test_full(False)
+
+    def _remote_write_test(self, template):
+        """
+        Run some remote python in a way that's useful for
+        testing free space behaviour (see test_* methods using this)
+        """
+        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")
+
+        # Enough to trip the full flag
+        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
+        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))
+
+        # Sufficient data to cause RADOS cluster to go 'full'
+        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))
+
+        # Long enough for RADOS cluster to notice it is full and set flag on mons
+        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
+        #  factor of 1.5 for I/O + network latency in committing OSD map and distributing it
+        #  to the OSDs)
+        full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5
+
+        # Configs for this test should bring this setting down in order to
+        # run reasonably quickly
+        if osd_mon_report_interval > 10:
+            log.warning("This test may run rather slowly unless you decrease"
+                     "osd_mon_report_interval (5 is a good setting)!")
+
+        # set the object_size to 1MB to make the objects destributed more evenly
+        # among the OSDs to fix Tracker#45434
+        file_layout = "stripe_unit=1048576 stripe_count=1 object_size=1048576"
+        self.mount_a.run_python(template.format(
+            fill_mb=self.fill_mb,
+            file_path=file_path,
+            file_layout=file_layout,
+            full_wait=full_wait,
+            is_fuse=isinstance(self.mount_a, FuseMount)
+        ))
+
+    def test_full_fclose(self):
+        # A remote script which opens a file handle, fills up the filesystem, and then
+        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+        remote_script = dedent("""
+            import time
+            import datetime
+            import subprocess
+            import os
+
+            # Write some buffered data through before going full, all should be well
+            print("writing some data through which we expect to succeed")
+            bytes = 0
+            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
+            bytes += os.write(f, b'a' * 512 * 1024)
+            os.fsync(f)
+            print("fsync'ed data successfully, will now attempt to fill fs")
+
+            # Okay, now we're going to fill up the filesystem, and then keep
+            # writing until we see an error from fsync.  As long as we're doing
+            # buffered IO, the error should always only appear from fsync and not
+            # from write
+            full = False
+
+            for n in range(0, int({fill_mb} * 0.9)):
+                bytes += os.write(f, b'x' * 1024 * 1024)
+                print("wrote {{0}} bytes via buffered write, may repeat".format(bytes))
+            print("done writing {{0}} bytes".format(bytes))
+
+            # OK, now we should sneak in under the full condition
+            # due to the time it takes the OSDs to report to the
+            # mons, and get a successful fsync on our full-making data
+            os.fsync(f)
+            print("successfully fsync'ed prior to getting full state reported")
+
+            # buffered write, add more dirty data to the buffer
+            print("starting buffered write")
+            try:
+                for n in range(0, int({fill_mb} * 0.2)):
+                    bytes += os.write(f, b'x' * 1024 * 1024)
+                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
+                    time.sleep({full_wait})
+            except OSError:
+                pass;
+
+            print("wrote, now waiting 30s and then doing a close we expect to fail")
+
+            # Wait long enough for a background flush that should fail
+            time.sleep(30)
+
+            if {is_fuse}:
+                # ...and check that the failed background flush is reflected in fclose
+                try:
+                    os.close(f)
+                except OSError:
+                    print("close() returned an error as expected")
+                else:
+                    raise RuntimeError("close() failed to raise error")
+            else:
+                # The kernel cephfs client does not raise errors on fclose
+                os.close(f)
+
+            os.unlink("{file_path}")
+            """)
+        self._remote_write_test(remote_script)
+
+    def test_full_fsync(self):
+        """
+        That when the full flag is encountered during asynchronous
+        flushes, such that an fwrite() succeeds but an fsync/fclose()
+        should return the ENOSPC error.
+        """
+
+        # A remote script which opens a file handle, fills up the filesystem, and then
+        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+        remote_script = dedent("""
+            import time
+            import datetime
+            import subprocess
+            import os
+
+            # Write some buffered data through before going full, all should be well
+            print("writing some data through which we expect to succeed")
+            bytes = 0
+            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
+            bytes += os.write(f, b'a' * 4096)
+            os.fsync(f)
+            print("fsync'ed data successfully, will now attempt to fill fs")
+
+            # Okay, now we're going to fill up the filesystem, and then keep
+            # writing until we see an error from fsync.  As long as we're doing
+            # buffered IO, the error should always only appear from fsync and not
+            # from write
+            full = False
+
+            for n in range(0, int({fill_mb} * 1.1)):
+                try:
+                    bytes += os.write(f, b'x' * 1024 * 1024)
+                    print("wrote bytes via buffered write, moving on to fsync")
+                except OSError as e:
+                    if {is_fuse}:
+                        print("Unexpected error %s from write() instead of fsync()" % e)
+                        raise
+                    else:
+                        print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
+                        full = True
+                        break
+
+                try:
+                    os.fsync(f)
+                    print("fsync'ed successfully")
+                except OSError as e:
+                    print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
+                    full = True
+                    break
+                else:
+                    print("Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0)))
+
+                if n > {fill_mb} * 0.9:
+                    # Be cautious in the last region where we expect to hit
+                    # the full condition, so that we don't overshoot too dramatically
+                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
+                    time.sleep({full_wait})
+
+            if not full:
+                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)
+
+            # close() should not raise an error because we already caught it in
+            # fsync.  There shouldn't have been any more writeback errors
+            # since then because all IOs got cancelled on the full flag.
+            print("calling close")
+            os.close(f)
+            print("close() did not raise error")
+
+            os.unlink("{file_path}")
+            """)
+
+        self._remote_write_test(remote_script)
+
+
+class TestQuotaFull(FullnessTestCase):
+    """
+    Test per-pool fullness, which indicates quota limits exceeded
+    """
+    pool_capacity = 1024 * 1024 * 32  # arbitrary low-ish limit
+    fill_mb = pool_capacity // (1024 * 1024)  # type: ignore
+
+    # We are only testing quota handling on the data pool, not the metadata
+    # pool.
+    data_only = True
+
+    def setUp(self):
+        super(TestQuotaFull, self).setUp()
+
+        pool_name = self.fs.get_data_pool_name()
+        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
+                                            "max_bytes", "{0}".format(self.pool_capacity))
+
+
+class TestClusterFull(FullnessTestCase):
+    """
+    Test data pool fullness, which indicates that an OSD has become too full
+    """
+    pool_capacity = None
+    REQUIRE_MEMSTORE = True
+
+    def setUp(self):
+        super(TestClusterFull, self).setUp()
+
+        if self.pool_capacity is None:
+            TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
+            TestClusterFull.fill_mb = (self.pool_capacity // (1024 * 1024))
+
+# Hide the parent class so that unittest.loader doesn't try to run it.
+del globals()['FullnessTestCase']
diff --git a/qa/tasks/cephfs/test_journal_migration.py b/qa/tasks/cephfs/test_journal_migration.py
new file mode 100644
index 000000000..67b514c22
--- /dev/null
+++ b/qa/tasks/cephfs/test_journal_migration.py
@@ -0,0 +1,100 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.workunit import task as workunit
+
+JOURNAL_FORMAT_LEGACY = 0
+JOURNAL_FORMAT_RESILIENT = 1
+
+
+class TestJournalMigration(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 2
+
+    def test_journal_migration(self):
+        old_journal_version = JOURNAL_FORMAT_LEGACY
+        new_journal_version = JOURNAL_FORMAT_RESILIENT
+
+        self.mount_a.umount_wait()
+        self.fs.mds_stop()
+
+        # Create a filesystem using the older journal format.
+        self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
+        self.fs.mds_restart()
+        self.fs.recreate()
+
+        # Enable standby replay, to cover the bug case #8811 where
+        # a standby replay might mistakenly end up trying to rewrite
+        # the journal at the same time as an active daemon.
+        self.fs.set_allow_standby_replay(True)
+
+        status = self.fs.wait_for_daemons()
+
+        self.assertTrue(self.fs.get_replay(status=status) is not None)
+
+        # Do some client work so that the log is populated with something.
+        with self.mount_a.mounted_wait():
+            self.mount_a.create_files()
+            self.mount_a.check_files()  # sanity, this should always pass
+
+            # Run a more substantial workunit so that the length of the log to be
+            # coverted is going span at least a few segments
+            workunit(self.ctx, {
+                'clients': {
+                    "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
+                },
+                "timeout": "3h"
+            })
+
+        # Modify the ceph.conf to ask the MDS to use the new journal format.
+        self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)
+
+        # Restart the MDS.
+        self.fs.mds_fail_restart()
+
+        # This ensures that all daemons come up into a valid state
+        status = self.fs.wait_for_daemons()
+
+        # Check that files created in the initial client workload are still visible
+        # in a client mount.
+        with self.mount_a.mounted_wait():
+            self.mount_a.check_files()
+
+        # Verify that the journal really has been rewritten.
+        journal_version = self.fs.get_journal_version()
+        if journal_version != new_journal_version:
+            raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
+                new_journal_version, journal_version()
+            ))
+
+        # Verify that cephfs-journal-tool can now read the rewritten journal
+        inspect_out = self.fs.journal_tool(["journal", "inspect"], 0)
+        if not inspect_out.endswith(": OK"):
+            raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
+                inspect_out
+            ))
+
+        self.fs.journal_tool(["event", "get", "json",
+                              "--path", "/tmp/journal.json"], 0)
+        p = self.fs.tool_remote.sh([
+                "python3",
+                "-c",
+                "import json; print(len(json.load(open('/tmp/journal.json'))))"
+            ])
+        event_count = int(p.strip())
+        if event_count < 1000:
+            # Approximate value of "lots", expected from having run fsstress
+            raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))
+
+        # Do some client work to check that writing the log is still working
+        with self.mount_a.mounted_wait():
+            workunit(self.ctx, {
+                'clients': {
+                    "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
+                },
+                "timeout": "3h"
+            })
+
+        # Check that both an active and a standby replay are still up
+        status = self.fs.status()
+        self.assertEqual(len(list(self.fs.get_replays(status=status))), 1)
+        self.assertEqual(len(list(self.fs.get_ranks(status=status))), 1)
diff --git a/qa/tasks/cephfs/test_journal_repair.py b/qa/tasks/cephfs/test_journal_repair.py
new file mode 100644
index 000000000..b810e1a28
--- /dev/null
+++ b/qa/tasks/cephfs/test_journal_repair.py
@@ -0,0 +1,428 @@
+
+"""
+Test our tools for recovering the content of damaged journals
+"""
+
+import json
+import logging
+from textwrap import dedent
+import time
+
+from teuthology.exceptions import CommandFailedError, ConnectionLostError
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+from tasks.workunit import task as workunit
+
+log = logging.getLogger(__name__)
+
+
+class TestJournalRepair(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    def test_inject_to_empty(self):
+        """
+        That when some dentries in the journal but nothing is in
+        the backing store, we correctly populate the backing store
+        from the journalled dentries.
+        """
+
+        # Inject metadata operations
+        self.mount_a.run_shell(["touch", "rootfile"])
+        self.mount_a.run_shell(["mkdir", "subdir"])
+        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
+        # There are several different paths for handling hardlinks, depending
+        # on whether an existing dentry (being overwritten) is also a hardlink
+        self.mount_a.run_shell(["mkdir", "linkdir"])
+
+        # Test inode -> remote transition for a dentry
+        self.mount_a.run_shell(["touch", "linkdir/link0"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])
+
+        # Test nothing -> remote transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])
+
+        # Test remote -> inode transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
+        self.mount_a.run_shell(["touch", "linkdir/link2"])
+
+        # Test remote -> diff remote transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
+        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])
+
+        # Test an empty directory
+        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
+        self.mount_a.run_shell(["sync"])
+
+        # Before we unmount, make a note of the inode numbers, later we will
+        # check that they match what we recover from the journal
+        rootfile_ino = self.mount_a.path_to_ino("rootfile")
+        subdir_ino = self.mount_a.path_to_ino("subdir")
+        linkdir_ino = self.mount_a.path_to_ino("linkdir")
+        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
+        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")
+
+        self.mount_a.umount_wait()
+
+        # Stop the MDS
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Now, the journal should contain the operations, but the backing
+        # store shouldn't
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(subdir_ino)
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+        # Execute the dentry recovery, this should populate the backing store
+        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
+
+        # Dentries in ROOT_INO are present
+        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
+        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
+        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
+                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))
+
+        # Now check the MDS can read what we wrote: truncate the journal
+        # and start the mds.
+        self.fs.journal_tool(['journal', 'reset'], 0)
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # List files
+        self.mount_a.mount_wait()
+
+        # First ls -R to populate MDCache, such that hardlinks will
+        # resolve properly (recover_dentries does not create backtraces,
+        # so ordinarily hardlinks to inodes that happen not to have backtraces
+        # will be invisible in readdir).
+        # FIXME: hook in forward scrub here to regenerate backtraces
+        proc = self.mount_a.run_shell(['ls', '-R'])
+        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
+        self.mount_a.mount_wait()
+
+        proc = self.mount_a.run_shell(['ls', '-R'])
+        self.assertEqual(proc.stdout.getvalue().strip(),
+                         dedent("""
+                         .:
+                         linkdir
+                         rootfile
+                         subdir
+
+                         ./linkdir:
+                         link0
+                         link1
+                         link2
+                         link3
+
+                         ./subdir:
+                         subdirfile
+                         subsubdir
+
+                         ./subdir/subsubdir:
+                         """).strip())
+
+        # Check the correct inos were preserved by path
+        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
+        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
+        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
+        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))
+
+        # Check that the hard link handling came out correctly
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
+        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)
+
+        # Create a new file, ensure it is not issued the same ino as one of the
+        # recovered ones
+        self.mount_a.run_shell(["touch", "afterwards"])
+        new_ino = self.mount_a.path_to_ino("afterwards")
+        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])
+
+        # Check that we can do metadata ops in the recovered directory
+        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])
+
+    @for_teuthology # 308s
+    def test_reset(self):
+        """
+        That after forcibly modifying the backing store, we can get back into
+        a good state by resetting the MDSMap.
+
+        The scenario is that we have two active MDSs, and we lose the journals.  Once
+        we have completely lost confidence in the integrity of the metadata, we want to
+        return the system to a single-MDS state to go into a scrub to recover what we
+        can.
+        """
+
+        # Set max_mds to 2
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+        active_mds_names = self.fs.get_active_names(status=status)
+
+        # Switch off any unneeded MDS daemons
+        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
+            self.mds_cluster.mds_stop(unneeded_mds)
+            self.mds_cluster.mds_fail(unneeded_mds)
+
+        # Create a dir on each rank
+        self.mount_a.run_shell_payload("mkdir {alpha,bravo} && touch {alpha,bravo}/file")
+        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")
+
+        # Ensure the pinning has taken effect and the /bravo dir is now
+        # migrated to rank 1.
+        self._wait_subtrees([('/bravo', 1), ('/alpha', 0)], rank=0, status=status)
+
+        # Do some IO (this should be split across ranks according to
+        # the rank-pinned dirs)
+        self.mount_a.create_n_files("alpha/file", 1000)
+        self.mount_a.create_n_files("bravo/file", 1000)
+
+        # Flush the journals so that we have some backing store data
+        # belonging to one MDS, and some to the other MDS.
+        for mds_name in active_mds_names:
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        # Stop (hard) the second MDS daemon
+        self.fs.mds_stop(active_mds_names[1])
+
+        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
+        # (this is the simulated failure that we will demonstrate that the disaster
+        #  recovery tools can get us back from)
+        self.fs.erase_metadata_objects(prefix="mds1_")
+
+        # Try to access files from the client
+        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)
+
+        # Check that this "ls -R" blocked rather than completing: indicates
+        # it got stuck trying to access subtrees which were on the now-dead MDS.
+        log.info("Sleeping to check ls is blocked...")
+        time.sleep(60)
+        self.assertFalse(blocked_ls.finished)
+
+        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
+        # is not coming back.  Kill it.
+        log.info("Killing mount, it's blocked on the MDS we killed")
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+        try:
+            # Now that the mount is dead, the ls -R should error out.
+            blocked_ls.wait()
+        except (CommandFailedError, ConnectionLostError):
+            # The ConnectionLostError case is for kernel client, where
+            # killing the mount also means killing the node.
+            pass
+
+        # See that the second MDS will crash when it starts and tries to
+        # acquire rank 1
+        damaged_id = active_mds_names[1]
+        self.fs.mds_restart(damaged_id)
+
+        # The daemon taking the damaged rank should start starting, then
+        # restart back into standby after asking the mon to mark the rank
+        # damaged.
+        def is_marked_damaged():
+            mds_map = self.fs.get_mds_map()
+            return 1 in mds_map['damaged']
+
+        self.wait_until_true(is_marked_damaged, 60)
+
+        def get_state():
+            info = self.mds_cluster.get_mds_info(damaged_id)
+            return info['state'] if info is not None else None
+
+        self.wait_until_equal(
+                get_state,
+                "up:standby",
+                timeout=60)
+
+        self.fs.mds_stop(damaged_id)
+        self.fs.mds_fail(damaged_id)
+
+        # Now give up and go through a disaster recovery procedure
+        self.fs.mds_stop(active_mds_names[0])
+        self.fs.mds_fail(active_mds_names[0])
+        # Invoke recover_dentries quietly, because otherwise log spews millions of lines
+        self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True)
+        self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True)
+        self.fs.table_tool(["0", "reset", "session"])
+        self.fs.journal_tool(["journal", "reset"], 0)
+        self.fs.erase_mds_objects(1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+                '--yes-i-really-mean-it')
+
+        # Bring an MDS back online, mount a client, and see that we can walk the full
+        # filesystem tree again
+        self.fs.mds_fail_restart(active_mds_names[0])
+        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
+                              reject_fn=lambda v: len(v) > 1)
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell(["ls", "-R"], wait=True)
+
+    def test_table_tool(self):
+        active_mdss = self.fs.get_active_names()
+        self.assertEqual(len(active_mdss), 1)
+        mds_name = active_mdss[0]
+
+        self.mount_a.run_shell(["touch", "foo"])
+        self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        log.info(self.fs.table_tool(["all", "show", "inode"]))
+        log.info(self.fs.table_tool(["all", "show", "snap"]))
+        log.info(self.fs.table_tool(["all", "show", "session"]))
+
+        # Inode table should always be the same because initial state
+        # and choice of inode are deterministic.
+        # Should see one inode consumed
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "inode"])),
+            {"0": {
+                "data": {
+                    "version": 2,
+                    "inotable": {
+                        "projected_free": [
+                            {"start": 1099511628777,
+                             "len": 1099511626775}],
+                        "free": [
+                            {"start": 1099511628777,
+                             "len": 1099511626775}]}},
+                "result": 0}}
+
+        )
+
+        # Should see one session
+        session_data = json.loads(self.fs.table_tool(
+            ["all", "show", "session"]))
+        self.assertEqual(len(session_data["0"]["data"]["sessions"]), 1)
+        self.assertEqual(session_data["0"]["result"], 0)
+
+        # Should see no snaps
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "snap"])),
+            {"version": 1,
+             "snapserver": {"last_snap": 1,
+                            "last_created": 1,
+                            "last_destroyed": 1,
+                            "pending_noop": [],
+                            "snaps": [],
+                            "need_to_purge": {},
+                            "pending_update": [],
+                            "pending_destroy": []},
+             "result": 0}
+        )
+
+        # Reset everything
+        for table in ["session", "inode", "snap"]:
+            self.fs.table_tool(["all", "reset", table])
+
+        log.info(self.fs.table_tool(["all", "show", "inode"]))
+        log.info(self.fs.table_tool(["all", "show", "snap"]))
+        log.info(self.fs.table_tool(["all", "show", "session"]))
+
+        # Should see 0 sessions
+        session_data = json.loads(self.fs.table_tool(
+            ["all", "show", "session"]))
+        self.assertEqual(len(session_data["0"]["data"]["sessions"]), 0)
+        self.assertEqual(session_data["0"]["result"], 0)
+
+        # Should see entire inode range now marked free
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "inode"])),
+            {"0": {"data": {"version": 1,
+                            "inotable": {"projected_free": [
+                                {"start": 1099511627776,
+                                 "len": 1099511627776}],
+                                 "free": [
+                                    {"start": 1099511627776,
+                                    "len": 1099511627776}]}},
+                   "result": 0}}
+        )
+
+        # Should see no snaps
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "snap"])),
+            {"version": 1,
+             "snapserver": {"last_snap": 1,
+                            "last_created": 1,
+                            "last_destroyed": 1,
+                            "pending_noop": [],
+                            "snaps": [],
+                            "need_to_purge": {},
+                            "pending_update": [],
+                            "pending_destroy": []},
+             "result": 0}
+        )
+
+    def test_table_tool_take_inos(self):
+        initial_range_start = 1099511627776
+        initial_range_len = 1099511627776
+        # Initially a completely clear range
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "show", "inode"])),
+            {"0": {"data": {"version": 0,
+                            "inotable": {"projected_free": [
+                                {"start": initial_range_start,
+                                 "len": initial_range_len}],
+                                "free": [
+                                    {"start": initial_range_start,
+                                     "len": initial_range_len}]}},
+                   "result": 0}}
+        )
+
+        # Remove some
+        self.assertEqual(
+            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
+            {"0": {"data": {"version": 1,
+                            "inotable": {"projected_free": [
+                                {"start": initial_range_start + 101,
+                                 "len": initial_range_len - 101}],
+                                "free": [
+                                    {"start": initial_range_start + 101,
+                                     "len": initial_range_len - 101}]}},
+                   "result": 0}}
+        )
+
+    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
+    def test_journal_smoke(self):
+        workunit(self.ctx, {
+            'clients': {
+                "client.{0}".format(self.mount_a.client_id): [
+                    "fs/misc/trivial_sync.sh"],
+            },
+            "timeout": "1h"
+        })
+
+        for mount in self.mounts:
+            mount.umount_wait()
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # journal tool smoke
+        workunit(self.ctx, {
+            'clients': {
+                "client.{0}".format(self.mount_a.client_id): [
+                    "suites/cephfs_journal_tool_smoke.sh"],
+            },
+            "timeout": "1h"
+        })
+
+
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount_wait()
+
+        # trivial sync moutn a
+        workunit(self.ctx, {
+            'clients': {
+                "client.{0}".format(self.mount_a.client_id): [
+                    "fs/misc/trivial_sync.sh"],
+            },
+            "timeout": "1h"
+        })
+
diff --git a/qa/tasks/cephfs/test_mantle.py b/qa/tasks/cephfs/test_mantle.py
new file mode 100644
index 000000000..746c2ffe3
--- /dev/null
+++ b/qa/tasks/cephfs/test_mantle.py
@@ -0,0 +1,111 @@
+from io import StringIO
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import json
+import logging
+
+log = logging.getLogger(__name__)
+failure = "using old balancer; mantle failed for balancer="
+success = "mantle balancer version changed: "
+
+class TestMantle(CephFSTestCase):
+    def start_mantle(self):
+        self.wait_for_health_clear(timeout=30)
+        self.fs.set_max_mds(2)
+        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        for m in self.fs.get_active_names():
+            self.fs.mds_asok(['config', 'set', 'debug_objecter', '20'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_ms', '0'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_mds', '0'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_mds_balancer', '5'], mds_id=m)
+
+    def push_balancer(self, obj, lua_code, expect):
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj)
+        self.fs.radosm(["put", obj, "-"], stdin=StringIO(lua_code))
+        with self.assert_cluster_log(failure + obj + " " + expect):
+            log.info("run a " + obj + " balancer that expects=" + expect)
+
+    def test_version_empty(self):
+        self.start_mantle()
+        expect = " : (2) No such file or directory"
+
+        ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer')
+        assert(ret == 22) # EINVAL
+
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ")
+        with self.assert_cluster_log(failure + " " + expect): pass
+
+    def test_version_not_in_rados(self):
+        self.start_mantle()
+        expect = failure + "ghost.lua : (2) No such file or directory"
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua")
+        with self.assert_cluster_log(expect): pass
+
+    def test_balancer_invalid(self):
+        self.start_mantle()
+        expect = ": (22) Invalid argument"
+
+        lua_code = "this is invalid lua code!"
+        self.push_balancer("invalid.lua", lua_code, expect)
+
+        lua_code = "BAL_LOG()"
+        self.push_balancer("invalid_log.lua", lua_code, expect)
+
+        lua_code = "BAL_LOG(0)"
+        self.push_balancer("invalid_log_again.lua", lua_code, expect)
+
+    def test_balancer_valid(self):
+        self.start_mantle()
+        lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}"
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
+        self.fs.radosm(["put", "valid.lua", "-"], stdin=StringIO(lua_code))
+        with self.assert_cluster_log(success + "valid.lua"):
+            log.info("run a valid.lua balancer")
+
+    def test_return_invalid(self):
+        self.start_mantle()
+        expect = ": (22) Invalid argument"
+
+        lua_code = "return \"hello\""
+        self.push_balancer("string.lua", lua_code, expect)
+
+        lua_code = "return 3"
+        self.push_balancer("number.lua", lua_code, expect)
+
+        lua_code = "return {}"
+        self.push_balancer("dict_empty.lua", lua_code, expect)
+
+        lua_code = "return {\"this\", \"is\", \"a\", \"test\"}"
+        self.push_balancer("dict_of_strings.lua", lua_code, expect)
+
+        lua_code = "return {3, \"test\"}"
+        self.push_balancer("dict_of_mixed.lua", lua_code, expect)
+
+        lua_code = "return {3}"
+        self.push_balancer("not_enough_numbers.lua", lua_code, expect)
+
+        lua_code = "return {3, 4, 5, 6, 7, 8, 9}"
+        self.push_balancer("too_many_numbers.lua", lua_code, expect)
+
+    def test_dead_osd(self):
+        self.start_mantle()
+        expect = " : (110) Connection timed out"
+
+        # kill the OSDs so that the balancer pull from RADOS times out
+        osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
+        for i in range(0, len(osd_map['osds'])):
+          self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i))
+          self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i))
+
+        # trigger a pull from RADOS
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
+
+        # make the timeout a little longer since dead OSDs spam ceph -w
+        with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30):
+            log.info("run a balancer that should timeout")
+
+        # cleanup
+        for i in range(0, len(osd_map['osds'])):
+          self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i))
diff --git a/qa/tasks/cephfs/test_mds_metrics.py b/qa/tasks/cephfs/test_mds_metrics.py
new file mode 100644
index 000000000..0a4b54a5f
--- /dev/null
+++ b/qa/tasks/cephfs/test_mds_metrics.py
@@ -0,0 +1,643 @@
+import os
+import json
+import time
+import random
+import logging
+import errno
+
+from teuthology.contextutil import safe_while, MaxWhileTries
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class TestMDSMetrics(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 3
+
+    TEST_DIR_PERFIX = "test_mds_metrics"
+
+    def setUp(self):
+        super(TestMDSMetrics, self).setUp()
+        self._start_with_single_active_mds()
+        self._enable_mgr_stats_plugin()
+
+    def tearDown(self):
+        self._disable_mgr_stats_plugin()
+        super(TestMDSMetrics, self).tearDown()
+
+    def _start_with_single_active_mds(self):
+        curr_max_mds = self.fs.get_var('max_mds')
+        if curr_max_mds > 1:
+            self.fs.shrink(1)
+
+    def verify_mds_metrics(self, active_mds_count=1, client_count=1, ranks=[], mul_fs=[]):
+        def verify_metrics_cbk(metrics):
+            mds_metrics = metrics['metrics']
+            if not len(mds_metrics) == active_mds_count + 1: # n active mdss + delayed set
+                return False
+            fs_status = self.fs.status()
+            nonlocal ranks, mul_fs
+            if not ranks:
+                if not mul_fs:
+                    mul_fs = [self.fs.id]
+                for filesystem in mul_fs:
+                    ranks = set([info['rank'] for info in fs_status.get_ranks(filesystem)])
+            for rank in ranks:
+                r = mds_metrics.get("mds.{}".format(rank), None)
+                if not r or not len(mds_metrics['delayed_ranks']) == 0:
+                    return False
+            for item in mul_fs:
+                key = fs_status.get_fsmap(item)['mdsmap']['fs_name']
+                global_metrics = metrics['global_metrics'].get(key, {})
+                client_metadata = metrics['client_metadata'].get(key, {})
+                if not len(global_metrics) >= client_count or not len(client_metadata) >= client_count:
+                    return False
+            return True
+        return verify_metrics_cbk
+
+    def _fs_perf_stats(self, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "perf", "stats", *args)
+
+    def _enable_mgr_stats_plugin(self):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", "stats")
+
+    def _disable_mgr_stats_plugin(self):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", "stats")
+
+    def _spread_directory_on_all_ranks(self, fscid):
+        fs_status = self.fs.status()
+        ranks = set([info['rank'] for info in fs_status.get_ranks(fscid)])
+        # create a per-rank pinned directory
+        for rank in ranks:
+            dirname = "{0}_{1}".format(TestMDSMetrics.TEST_DIR_PERFIX, rank)
+            self.mount_a.run_shell(["mkdir", dirname])
+            self.mount_a.setfattr(dirname, "ceph.dir.pin", str(rank))
+            log.info("pinning directory {0} to rank {1}".format(dirname, rank))
+            for i in range(16):
+                filename = "{0}.{1}".format("test", i)
+                self.mount_a.write_n_mb(os.path.join(dirname, filename), 1)
+
+    def _do_spread_io(self, fscid):
+        # spread readdir I/O
+        self.mount_b.run_shell(["find", "."])
+
+    def _do_spread_io_all_clients(self, fscid):
+        # spread readdir I/O
+        self.mount_a.run_shell(["find", "."])
+        self.mount_b.run_shell(["find", "."])
+
+    def _cleanup_test_dirs(self):
+        dirnames = self.mount_a.run_shell(["ls"]).stdout.getvalue()
+        for dirname in dirnames.split("\n"):
+            if dirname.startswith(TestMDSMetrics.TEST_DIR_PERFIX):
+                log.info("cleaning directory {}".format(dirname))
+                self.mount_a.run_shell(["rm", "-rf", dirname])
+
+    def _get_metrics(self, verifier_callback, trials, *args):
+        metrics = None
+        done = False
+        with safe_while(sleep=1, tries=trials, action='wait for metrics') as proceed:
+            while proceed():
+                metrics = json.loads(self._fs_perf_stats(*args))
+                done = verifier_callback(metrics)
+                if done:
+                    break
+        return done, metrics
+
+    def _setup_fs(self, fs_name):
+        fs_a = self.mds_cluster.newfs(name=fs_name)
+
+        self.mds_cluster.mds_restart()
+
+        # Wait for filesystem to go healthy
+        fs_a.wait_for_daemons()
+
+        # Reconfigure client auth caps
+        for mount in self.mounts:
+            self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+                'auth', 'caps', f"client.{mount.client_id}",
+                'mds', 'allow',
+                'mon', 'allow r',
+                'osd', f'allow rw pool={fs_a.get_data_pool_name()}')
+
+        return fs_a
+
+    # basic check to verify if we get back metrics from each active mds rank
+
+    def test_metrics_from_rank(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+    def test_metrics_post_client_disconnection(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        self.mount_a.umount_wait()
+
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED - 1), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+    def test_metrics_mds_grow(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # grow the mds cluster
+        self.fs.grow(2)
+
+        fscid = self.fs.id
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fscid)
+
+        # spread some I/O
+        self._do_spread_io(fscid)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        valid, metrics = self._get_metrics(self.verify_mds_metrics(
+            active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED) , 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # cleanup test directories
+        self._cleanup_test_dirs()
+
+    def test_metrics_mds_grow_and_shrink(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # grow the mds cluster
+        self.fs.grow(2)
+
+        fscid = self.fs.id
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fscid)
+
+        # spread some I/O
+        self._do_spread_io(fscid)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # shrink mds cluster
+        self.fs.shrink(1)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # cleanup test directories
+        self._cleanup_test_dirs()
+
+    def test_delayed_metrics(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # grow the mds cluster
+        self.fs.grow(2)
+
+        fscid = self.fs.id
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fscid)
+
+        # spread some I/O
+        self._do_spread_io(fscid)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # do not give this mds any chance
+        delayed_rank = 1
+        mds_id_rank0 = self.fs.get_rank(rank=0)['name']
+        mds_id_rank1 = self.fs.get_rank(rank=1)['name']
+
+        self.fs.set_inter_mds_block(True, mds_id_rank0, mds_id_rank1)
+
+        def verify_delayed_metrics(metrics):
+            mds_metrics = metrics['metrics']
+            r = mds_metrics.get("mds.{}".format(delayed_rank), None)
+            if not r or not delayed_rank in mds_metrics['delayed_ranks']:
+                return False
+            return True
+        # validate
+        valid, metrics = self._get_metrics(verify_delayed_metrics, 30)
+        log.debug("metrics={0}".format(metrics))
+
+        self.assertTrue(valid)
+        self.fs.set_inter_mds_block(False, mds_id_rank0, mds_id_rank1)
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # cleanup test directories
+        self._cleanup_test_dirs()
+
+    def test_query_mds_filter(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # grow the mds cluster
+        self.fs.grow(2)
+
+        fscid = self.fs.id
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fscid)
+
+        # spread some I/O
+        self._do_spread_io(fscid)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        filtered_mds = 1
+        def verify_filtered_mds_rank_metrics(metrics):
+        # checks if the metrics has only client_metadata and
+        # global_metrics filtered using --mds_rank=1
+            global_metrics = metrics['global_metrics'].get(self.fs.name, {})
+            client_metadata = metrics['client_metadata'].get(self.fs.name, {})
+            mds_metrics = metrics['metrics']
+            if len(mds_metrics) != 2 or f"mds.{filtered_mds}" not in mds_metrics:
+                return False
+            if len(global_metrics) > TestMDSMetrics.CLIENTS_REQUIRED or\
+                    len(client_metadata) > TestMDSMetrics.CLIENTS_REQUIRED:
+                return False
+            if len(set(global_metrics) - set(mds_metrics[f"mds.{filtered_mds}"])) or\
+                    len(set(client_metadata) - set(mds_metrics[f"mds.{filtered_mds}"])):
+                return False
+            return True
+        # initiate a new query with `--mds_rank` filter and validate if
+        # we get metrics *only* from that mds.
+        valid, metrics = self._get_metrics(verify_filtered_mds_rank_metrics, 30,
+                                           f'--mds_rank={filtered_mds}')
+        log.debug(f"metrics={metrics}")
+        self.assertTrue(valid, "Incorrect 'ceph fs perf stats' output"
+                        f" with filter '--mds_rank={filtered_mds}'")
+
+    def test_query_client_filter(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        mds_metrics = metrics['metrics']
+        # pick an random client
+        client = random.choice(list(mds_metrics['mds.0'].keys()))
+        # could have used regex to extract client id
+        client_id = (client.split(' ')[0]).split('.')[-1]
+
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=1), 30, '--client_id={}'.format(client_id))
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+    def test_query_client_ip_filter(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        client_matadata = metrics['client_metadata'][self.fs.name]
+        # pick an random client
+        client = random.choice(list(client_matadata.keys()))
+        # get IP of client to use in filter
+        client_ip = client_matadata[client]['IP']
+
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=1), 30, '--client_ip={}'.format(client_ip))
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # verify IP from output with filter IP
+        for i in metrics['client_metadata'][self.fs.name]:
+            self.assertEqual(client_ip, metrics['client_metadata'][self.fs.name][i]['IP'])
+
+    def test_query_mds_and_client_filter(self):
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        # grow the mds cluster
+        self.fs.grow(2)
+
+        fscid = self.fs.id
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fscid)
+
+        # spread some I/O
+        self._do_spread_io_all_clients(fscid)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        mds_metrics = metrics['metrics']
+
+        # pick an random client
+        client = random.choice(list(mds_metrics['mds.1'].keys()))
+        # could have used regex to extract client id
+        client_id = (client.split(' ')[0]).split('.')[-1]
+        filtered_mds = 1
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=1, ranks=[filtered_mds]),
+            30, '--mds_rank={}'.format(filtered_mds), '--client_id={}'.format(client_id))
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+    def test_for_invalid_mds_rank(self):
+        invalid_mds_rank = "1,"
+        # try, 'fs perf stat' command with invalid mds_rank
+        try:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "perf", "stats", "--mds_rank", invalid_mds_rank)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs perf stat' command to fail for invalid mds_rank")
+
+    def test_for_invalid_client_id(self):
+        invalid_client_id = "abcd"
+        # try, 'fs perf stat' command with invalid client_id
+        try:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "perf", "stats", "--client_id", invalid_client_id)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs perf stat' command to fail for invalid client_id")
+
+    def test_for_invalid_client_ip(self):
+        invalid_client_ip = "1.2.3"
+        # try, 'fs perf stat' command with invalid client_ip
+        try:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "perf", "stats", "--client_ip", invalid_client_ip)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs perf stat' command to fail for invalid client_ip")
+    
+    def test_perf_stats_stale_metrics(self):
+        """
+        That `ceph fs perf stats` doesn't output stale metrics after the rank0 MDS failover
+        """
+        # validate
+        valid, metrics = self._get_metrics(self.verify_mds_metrics(
+            active_mds_count=1, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug(f'metrics={metrics}')
+        self.assertTrue(valid)
+
+        # mount_a and mount_b are the clients mounted for TestMDSMetrics. So get their
+        # entries from the global_metrics.
+        client_a_name = f'client.{self.mount_a.get_global_id()}'
+        client_b_name = f'client.{self.mount_b.get_global_id()}'
+
+        global_metrics = metrics['global_metrics']
+        client_a_metrics = global_metrics[self.fs.name][client_a_name]
+        client_b_metrics = global_metrics[self.fs.name][client_b_name]
+
+        # fail rank0 mds
+        self.fs.rank_fail(rank=0)
+
+        # Wait for rank0 up:active state
+        self.fs.wait_for_state('up:active', rank=0, timeout=30)
+
+        fscid = self.fs.id
+
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fscid)
+
+        # spread some I/O
+        self._do_spread_io_all_clients(fscid)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        try:
+            valid, metrics_new = self._get_metrics(self.verify_mds_metrics(
+                active_mds_count=1, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+            log.debug(f'metrics={metrics_new}')
+            self.assertTrue(valid)
+
+            client_metadata = metrics_new['client_metadata']
+            client_a_metadata = client_metadata.get(self.fs.name, {}).get(client_a_name, {})
+            client_b_metadata = client_metadata.get(self.fs.name, {}).get(client_b_name, {})
+
+            global_metrics = metrics_new['global_metrics']
+            client_a_metrics_new = global_metrics.get(self.fs.name, {}).get(client_a_name, {})
+            client_b_metrics_new = global_metrics.get(self.fs.name, {}).get(client_b_name, {})
+
+            # the metrics should be different for the test to succeed.
+            self.assertTrue(client_a_metadata and client_b_metadata and
+                            client_a_metrics_new and client_b_metrics_new and
+                            (client_a_metrics_new != client_a_metrics) and
+                            (client_b_metrics_new != client_b_metrics),
+                            "Invalid 'ceph fs perf stats' metrics after rank0 mds failover")
+        except MaxWhileTries:
+            raise RuntimeError("Failed to fetch 'ceph fs perf stats' metrics")
+        finally:
+            # cleanup test directories
+            self._cleanup_test_dirs()
+
+    def test_client_metrics_and_metadata(self):
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+        self.fs.delete_all_filesystems()
+
+        self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
+            "enable_multiple", "true", "--yes-i-really-mean-it")
+
+        # creating filesystem
+        fs_a = self._setup_fs(fs_name="fs1")
+
+        # Mount a client on fs_a
+        self.mount_a.mount_wait(cephfs_name=fs_a.name)
+        self.mount_a.write_n_mb("pad.bin", 1)
+        self.mount_a.write_n_mb("test.bin", 2)
+        self.mount_a.path_to_ino("test.bin")
+        self.mount_a.create_files()
+
+        # creating another filesystem
+        fs_b = self._setup_fs(fs_name="fs2")
+
+        # Mount a client on fs_b
+        self.mount_b.mount_wait(cephfs_name=fs_b.name)
+        self.mount_b.write_n_mb("test.bin", 1)
+        self.mount_b.path_to_ino("test.bin")
+        self.mount_b.create_files()
+
+        fscid_list = [fs_a.id, fs_b.id]
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=1, mul_fs=fscid_list), 30)
+        log.debug(f"metrics={metrics}")
+        self.assertTrue(valid)
+
+        client_metadata_a = metrics['client_metadata']['fs1']
+        client_metadata_b = metrics['client_metadata']['fs2']
+
+        for i in client_metadata_a:
+            if not (client_metadata_a[i]['hostname']):
+                raise RuntimeError("hostname of fs1 not found!")
+            if not (client_metadata_a[i]['valid_metrics']):
+                raise RuntimeError("valid_metrics of fs1 not found!")
+
+        for i in client_metadata_b:
+            if not (client_metadata_b[i]['hostname']):
+                raise RuntimeError("hostname of fs2 not found!")
+            if not (client_metadata_b[i]['valid_metrics']):
+                raise RuntimeError("valid_metrics of fs2 not found!")
+
+    def test_non_existing_mds_rank(self):
+        def verify_filtered_metrics(metrics):
+        # checks if the metrics has non empty client_metadata and global_metrics
+            if metrics['client_metadata'].get(self.fs.name, {})\
+              or metrics['global_metrics'].get(self.fs.name, {}):
+                return True
+            return False
+
+        try:
+            # validate
+            filter_rank = random.randint(1, 10)
+            valid, metrics = self._get_metrics(verify_filtered_metrics, 30,
+                                               '--mds_rank={}'.format(filter_rank))
+            log.info(f'metrics={metrics}')
+            self.assertFalse(valid, "Fetched 'ceph fs perf stats' metrics using nonexistent MDS rank")
+        except MaxWhileTries:
+            # success
+            pass
+
+    def test_perf_stats_stale_metrics_with_multiple_filesystem(self):
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
+                    "enable_multiple", "true", "--yes-i-really-mean-it")
+
+        # creating filesystem
+        fs_b = self._setup_fs(fs_name="fs2")
+
+        # Mount a client on fs_b
+        self.mount_b.mount_wait(cephfs_name=fs_b.name)
+        self.mount_b.write_n_mb("test.bin", 1)
+        self.mount_b.path_to_ino("test.bin")
+        self.mount_b.create_files()
+
+        # creating another filesystem
+        fs_a = self._setup_fs(fs_name="fs1")
+
+        # Mount a client on fs_a
+        self.mount_a.mount_wait(cephfs_name=fs_a.name)
+        self.mount_a.write_n_mb("pad.bin", 1)
+        self.mount_a.write_n_mb("test.bin", 2)
+        self.mount_a.path_to_ino("test.bin")
+        self.mount_a.create_files()
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=1, mul_fs=[fs_a.id, fs_b.id]), 30)
+        log.debug(f"metrics={metrics}")
+        self.assertTrue(valid)
+
+        # get mounted client's entries from the global_metrics.
+        client_a_name = f'client.{self.mount_a.get_global_id()}'
+
+        global_metrics = metrics['global_metrics']
+        client_a_metrics = global_metrics.get("fs1", {}).get(client_a_name, {})
+
+        # fail active mds of fs_a
+        fs_a_mds = fs_a.get_active_names()[0]
+        self.mds_cluster.mds_fail(fs_a_mds)
+        fs_a.wait_for_state('up:active', rank=0, timeout=30)
+
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fs_a.id)
+
+        # spread some I/O
+        self._do_spread_io_all_clients(fs_a.id)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        try:
+            valid, metrics_new = self._get_metrics(
+                self.verify_mds_metrics(client_count=1, mul_fs=[fs_a.id, fs_b.id]), 30)
+            log.debug(f'metrics={metrics_new}')
+            self.assertTrue(valid)
+
+            client_metadata = metrics_new['client_metadata']
+            client_a_metadata = client_metadata.get("fs1", {}).get(client_a_name, {})
+
+            global_metrics = metrics_new['global_metrics']
+            client_a_metrics_new = global_metrics.get("fs1", {}).get(client_a_name, {})
+
+            # the metrics should be different for the test to succeed.
+            self.assertTrue(client_a_metadata and client_a_metrics_new
+                            and (client_a_metrics_new != client_a_metrics),
+                            "Invalid 'ceph fs perf stats' metrics after"
+                            f" rank0 mds of {fs_a.name} failover")
+        except MaxWhileTries:
+            raise RuntimeError("Failed to fetch `ceph fs perf stats` metrics")
+        finally:
+            # cleanup test directories
+            self._cleanup_test_dirs()
+
diff --git a/qa/tasks/cephfs/test_meta_injection.py b/qa/tasks/cephfs/test_meta_injection.py
new file mode 100644
index 000000000..916b30a25
--- /dev/null
+++ b/qa/tasks/cephfs/test_meta_injection.py
@@ -0,0 +1,38 @@
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+class TestMetaInjection(CephFSTestCase):
+    def test_meta_injection(self):
+        conf_ori = self.fs.mds_asok(['config', 'show'])
+        self.fs.mds_asok(['config', 'set', 'mds_log_max_segments', '1'])
+        self.mount_a.run_shell(["mkdir", "metadir"])
+        self.mount_a.run_shell(["touch", "metadir/metafile1"])
+        self.mount_a.run_shell(["touch", "metadir/metafile2"])
+        self.fs.mds_asok(['flush', 'journal'])
+        dirino = self.mount_a.path_to_ino("metadir") 
+        ino = self.mount_a.path_to_ino("metadir/metafile1")
+        
+        # export meta of ino
+        self.fs.meta_tool(['showm', '-i', str(ino), '-o', '/tmp/meta_out'], 0, True)
+        out = self.mount_a.run_shell(['grep', str(ino),'/tmp/meta_out']).stdout.getvalue().strip()
+        
+        # check the metadata of ino
+        self.assertNotEqual(out.find(u'"ino":'+ str(ino)), -1)
+        
+        # amend info of ino
+        self.fs.get_meta_of_fs_file(dirino, "metafile1", "/tmp/meta_obj")
+        self.fs.meta_tool(['amend', '-i', str(ino), '--in', '/tmp/meta_out', '--yes-i-really-really-mean-it'], 0, True)
+        self.fs.get_meta_of_fs_file(dirino, "metafile1", "/tmp/meta_obj_chg")
+        
+        # checkout meta_out after import it
+        ori_mds5 = self.mount_a.run_shell(["md5sum", "/tmp/meta_obj"]).stdout.getvalue().strip().split()
+        chg_mds5 = self.mount_a.run_shell(["md5sum", "/tmp/meta_obj_chg"]).stdout.getvalue().strip().split()
+        print(ori_mds5," ==> ", chg_mds5)
+        self.assertEqual(len(ori_mds5), 2)
+        self.assertEqual(len(chg_mds5), 2)
+        self.assertEqual(ori_mds5[0], chg_mds5[0])
+
+        self.mount_a.run_shell(["rm", "metadir", "-rf"])
+        self.mount_a.run_shell(["rm", "/tmp/meta_obj"])
+        self.mount_a.run_shell(["rm", "/tmp/meta_obj_chg"])
+        # restore config of mds_log_max_segments
+        self.fs.mds_asok(['config', 'set', 'mds_log_max_segments', conf_ori["mds_log_max_segments"]])
diff --git a/qa/tasks/cephfs/test_mirroring.py b/qa/tasks/cephfs/test_mirroring.py
new file mode 100644
index 000000000..a5f8cdac7
--- /dev/null
+++ b/qa/tasks/cephfs/test_mirroring.py
@@ -0,0 +1,1263 @@
+import os
+import json
+import errno
+import logging
+import random
+import time
+
+from io import StringIO
+from collections import deque
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+from teuthology.contextutil import safe_while
+
+log = logging.getLogger(__name__)
+
+class TestMirroring(CephFSTestCase):
+    MDSS_REQUIRED = 5
+    CLIENTS_REQUIRED = 2
+    REQUIRE_BACKUP_FILESYSTEM = True
+
+    MODULE_NAME = "mirroring"
+
+    def setUp(self):
+        super(TestMirroring, self).setUp()
+        self.primary_fs_name = self.fs.name
+        self.primary_fs_id = self.fs.id
+        self.secondary_fs_name = self.backup_fs.name
+        self.secondary_fs_id = self.backup_fs.id
+        self.enable_mirroring_module()
+
+    def tearDown(self):
+        self.disable_mirroring_module()
+        super(TestMirroring, self).tearDown()
+
+    def enable_mirroring_module(self):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", TestMirroring.MODULE_NAME)
+
+    def disable_mirroring_module(self):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", TestMirroring.MODULE_NAME)
+
+    def enable_mirroring(self, fs_name, fs_id):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "enable", fs_name)
+        time.sleep(10)
+        # verify via asok
+        res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                         'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        self.assertTrue(res['peers'] == {})
+        self.assertTrue(res['snap_dirs']['dir_count'] == 0)
+
+    def disable_mirroring(self, fs_name, fs_id):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "disable", fs_name)
+        time.sleep(10)
+        # verify via asok
+        try:
+            self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                       'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError('expected admin socket to be unavailable')
+
+    def verify_peer_added(self, fs_name, fs_id, peer_spec, remote_fs_name=None):
+        # verify via asok
+        res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                         'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        self.assertTrue(peer_uuid in res['peers'])
+        client_name = res['peers'][peer_uuid]['remote']['client_name']
+        cluster_name = res['peers'][peer_uuid]['remote']['cluster_name']
+        self.assertTrue(peer_spec == f'{client_name}@{cluster_name}')
+        if remote_fs_name:
+            self.assertTrue(self.secondary_fs_name == res['peers'][peer_uuid]['remote']['fs_name'])
+        else:
+            self.assertTrue(self.fs_name == res['peers'][peer_uuid]['remote']['fs_name'])
+
+    def peer_add(self, fs_name, fs_id, peer_spec, remote_fs_name=None):
+        if remote_fs_name:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_add", fs_name, peer_spec, remote_fs_name)
+        else:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_add", fs_name, peer_spec)
+        time.sleep(10)
+        self.verify_peer_added(fs_name, fs_id, peer_spec, remote_fs_name)
+
+    def peer_remove(self, fs_name, fs_id, peer_spec):
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_remove", fs_name, peer_uuid)
+        time.sleep(10)
+        # verify via asok
+        res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                         'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        self.assertTrue(res['peers'] == {} and res['snap_dirs']['dir_count'] == 0)
+
+    def bootstrap_peer(self, fs_name, client_name, site_name):
+        outj = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "fs", "snapshot", "mirror", "peer_bootstrap", "create", fs_name, client_name, site_name))
+        return outj['token']
+
+    def import_peer(self, fs_name, token):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_bootstrap", "import",
+                                                     fs_name, token)
+
+    def add_directory(self, fs_name, fs_id, dir_name):
+        # get initial dir count
+        res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                         'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        dir_count = res['snap_dirs']['dir_count']
+        log.debug(f'initial dir_count={dir_count}')
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "add", fs_name, dir_name)
+
+        time.sleep(10)
+        # verify via asok
+        res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                         'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        new_dir_count = res['snap_dirs']['dir_count']
+        log.debug(f'new dir_count={new_dir_count}')
+        self.assertTrue(new_dir_count > dir_count)
+
+    def remove_directory(self, fs_name, fs_id, dir_name):
+        # get initial dir count
+        res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                         'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        dir_count = res['snap_dirs']['dir_count']
+        log.debug(f'initial dir_count={dir_count}')
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "remove", fs_name, dir_name)
+
+        time.sleep(10)
+        # verify via asok
+        res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                         'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        new_dir_count = res['snap_dirs']['dir_count']
+        log.debug(f'new dir_count={new_dir_count}')
+        self.assertTrue(new_dir_count < dir_count)
+
+    def check_peer_status(self, fs_name, fs_id, peer_spec, dir_name, expected_snap_name,
+                          expected_snap_count):
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        res = self.mirror_daemon_command(f'peer status for fs: {fs_name}',
+                                         'fs', 'mirror', 'peer', 'status',
+                                         f'{fs_name}@{fs_id}', peer_uuid)
+        self.assertTrue(dir_name in res)
+        self.assertTrue(res[dir_name]['last_synced_snap']['name'] == expected_snap_name)
+        self.assertTrue(res[dir_name]['snaps_synced'] == expected_snap_count)
+
+    def check_peer_status_deleted_snap(self, fs_name, fs_id, peer_spec, dir_name,
+                                      expected_delete_count):
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        res = self.mirror_daemon_command(f'peer status for fs: {fs_name}',
+                                         'fs', 'mirror', 'peer', 'status',
+                                         f'{fs_name}@{fs_id}', peer_uuid)
+        self.assertTrue(dir_name in res)
+        self.assertTrue(res[dir_name]['snaps_deleted'] == expected_delete_count)
+
+    def check_peer_status_renamed_snap(self, fs_name, fs_id, peer_spec, dir_name,
+                                       expected_rename_count):
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        res = self.mirror_daemon_command(f'peer status for fs: {fs_name}',
+                                         'fs', 'mirror', 'peer', 'status',
+                                         f'{fs_name}@{fs_id}', peer_uuid)
+        self.assertTrue(dir_name in res)
+        self.assertTrue(res[dir_name]['snaps_renamed'] == expected_rename_count)
+
+    def check_peer_snap_in_progress(self, fs_name, fs_id,
+                                    peer_spec, dir_name, snap_name):
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        res = self.mirror_daemon_command(f'peer status for fs: {fs_name}',
+                                         'fs', 'mirror', 'peer', 'status',
+                                         f'{fs_name}@{fs_id}', peer_uuid)
+        self.assertTrue('syncing' == res[dir_name]['state'])
+        self.assertTrue(res[dir_name]['current_sycning_snap']['name'] == snap_name)
+
+    def verify_snapshot(self, dir_name, snap_name):
+        snap_list = self.mount_b.ls(path=f'{dir_name}/.snap')
+        self.assertTrue(snap_name in snap_list)
+
+        source_res = self.mount_a.dir_checksum(path=f'{dir_name}/.snap/{snap_name}',
+                                               follow_symlinks=True)
+        log.debug(f'source snapshot checksum {snap_name} {source_res}')
+
+        dest_res = self.mount_b.dir_checksum(path=f'{dir_name}/.snap/{snap_name}',
+                                             follow_symlinks=True)
+        log.debug(f'destination snapshot checksum {snap_name} {dest_res}')
+        self.assertTrue(source_res == dest_res)
+
+    def verify_failed_directory(self, fs_name, fs_id, peer_spec, dir_name):
+        peer_uuid = self.get_peer_uuid(peer_spec)
+        res = self.mirror_daemon_command(f'peer status for fs: {fs_name}',
+                                         'fs', 'mirror', 'peer', 'status',
+                                         f'{fs_name}@{fs_id}', peer_uuid)
+        self.assertTrue('failed' == res[dir_name]['state'])
+
+    def get_peer_uuid(self, peer_spec):
+        status = self.fs.status()
+        fs_map = status.get_fsmap_byname(self.primary_fs_name)
+        peers = fs_map['mirror_info']['peers']
+        for peer_uuid, mirror_info in peers.items():
+            client_name = mirror_info['remote']['client_name']
+            cluster_name = mirror_info['remote']['cluster_name']
+            remote_peer_spec = f'{client_name}@{cluster_name}'
+            if peer_spec == remote_peer_spec:
+                return peer_uuid
+        return None
+
+    def get_daemon_admin_socket(self):
+        """overloaded by teuthology override (fs/mirror/clients/mirror.yaml)"""
+        return "/var/run/ceph/cephfs-mirror.asok"
+
+    def get_mirror_daemon_pid(self):
+        """pid file overloaded in fs/mirror/clients/mirror.yaml"""
+        return self.mount_a.run_shell(['cat', '/var/run/ceph/cephfs-mirror.pid']).stdout.getvalue().strip()
+
+    def get_mirror_rados_addr(self, fs_name, fs_id):
+        """return the rados addr used by cephfs-mirror instance"""
+        res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}',
+                                         'fs', 'mirror', 'status', f'{fs_name}@{fs_id}')
+        return res['rados_inst']
+
+    def mirror_daemon_command(self, cmd_label, *args):
+        asok_path = self.get_daemon_admin_socket()
+        try:
+            # use mount_a's remote to execute command
+            p = self.mount_a.client_remote.run(args=
+                     ['ceph', '--admin-daemon', asok_path] + list(args),
+                     stdout=StringIO(), stderr=StringIO(), timeout=30,
+                     check_status=True, label=cmd_label)
+            p.wait()
+        except CommandFailedError as ce:
+            log.warn(f'mirror daemon command with label "{cmd_label}" failed: {ce}')
+            raise
+        res = p.stdout.getvalue().strip()
+        log.debug(f'command returned={res}')
+        return json.loads(res)
+
+    def get_mirror_daemon_status(self):
+        daemon_status = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "daemon", "status"))
+        log.debug(f'daemon_status: {daemon_status}')
+        # running a single mirror daemon is supported
+        status = daemon_status[0]
+        log.debug(f'status: {status}')
+        return status
+
+    def test_basic_mirror_commands(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_mirror_peer_commands(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+        # add peer
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+        # remove peer
+        self.peer_remove(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph")
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_mirror_disable_with_peer(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+        # add peer
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_matching_peer(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+        try:
+            self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph")
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise RuntimeError('invalid errno when adding a matching remote peer')
+        else:
+            raise RuntimeError('adding a peer matching local spec should fail')
+
+        # verify via asok -- nothing should get added
+        res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                         'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+        self.assertTrue(res['peers'] == {})
+
+        # and explicitly specifying the spec (via filesystem name) should fail too
+        try:
+            self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.primary_fs_name)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise RuntimeError('invalid errno when adding a matching remote peer')
+        else:
+            raise RuntimeError('adding a peer matching local spec should fail')
+
+        # verify via asok -- nothing should get added
+        res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                         'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+        self.assertTrue(res['peers'] == {})
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_mirror_peer_add_existing(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+        # add peer
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # adding the same peer should be idempotent
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # remove peer
+        self.peer_remove(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph")
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_peer_commands_with_mirroring_disabled(self):
+        # try adding peer when mirroring is not enabled
+        try:
+            self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise RuntimeError(-errno.EINVAL, 'incorrect error code when adding a peer')
+        else:
+            raise RuntimeError(-errno.EINVAL, 'expected peer_add to fail')
+
+        # try removing peer
+        try:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_remove", self.primary_fs_name, 'dummy-uuid')
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise RuntimeError(-errno.EINVAL, 'incorrect error code when removing a peer')
+        else:
+            raise RuntimeError(-errno.EINVAL, 'expected peer_remove to fail')
+
+    def test_add_directory_with_mirroring_disabled(self):
+        # try adding a directory when mirroring is not enabled
+        try:
+            self.add_directory(self.primary_fs_name, self.primary_fs_id, "/d1")
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise RuntimeError(-errno.EINVAL, 'incorrect error code when adding a directory')
+        else:
+            raise RuntimeError(-errno.EINVAL, 'expected directory add to fail')
+
+    def test_directory_commands(self):
+        self.mount_a.run_shell(["mkdir", "d1"])
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
+        try:
+            self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EEXIST:
+                raise RuntimeError(-errno.EINVAL, 'incorrect error code when re-adding a directory')
+        else:
+            raise RuntimeError(-errno.EINVAL, 'expected directory add to fail')
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
+        try:
+            self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
+        except CommandFailedError as ce:
+            if ce.exitstatus not in (errno.ENOENT, errno.EINVAL):
+                raise RuntimeError(-errno.EINVAL, 'incorrect error code when re-deleting a directory')
+        else:
+            raise RuntimeError(-errno.EINVAL, 'expected directory removal to fail')
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.mount_a.run_shell(["rmdir", "d1"])
+
+    def test_add_relative_directory_path(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        try:
+            self.add_directory(self.primary_fs_name, self.primary_fs_id, './d1')
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise RuntimeError(-errno.EINVAL, 'incorrect error code when adding a relative path dir')
+        else:
+            raise RuntimeError(-errno.EINVAL, 'expected directory add to fail')
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_add_directory_path_normalization(self):
+        self.mount_a.run_shell(["mkdir", "-p", "d1/d2/d3"])
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1/d2/d3')
+        def check_add_command_failure(dir_path):
+            try:
+                self.add_directory(self.primary_fs_name, self.primary_fs_id, dir_path)
+            except CommandFailedError as ce:
+                if ce.exitstatus != errno.EEXIST:
+                    raise RuntimeError(-errno.EINVAL, 'incorrect error code when re-adding a directory')
+            else:
+                raise RuntimeError(-errno.EINVAL, 'expected directory add to fail')
+
+        # everything points for /d1/d2/d3
+        check_add_command_failure('/d1/d2/././././././d3')
+        check_add_command_failure('/d1/d2/././././././d3//////')
+        check_add_command_failure('/d1/d2/../d2/././././d3')
+        check_add_command_failure('/././././d1/./././d2/./././d3//////')
+        check_add_command_failure('/./d1/./d2/./d3/../../../d1/d2/d3')
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.mount_a.run_shell(["rm", "-rf", "d1"])
+
+    def test_add_ancestor_and_child_directory(self):
+        self.mount_a.run_shell(["mkdir", "-p", "d1/d2/d3"])
+        self.mount_a.run_shell(["mkdir", "-p", "d1/d4"])
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1/d2/')
+        def check_add_command_failure(dir_path):
+            try:
+                self.add_directory(self.primary_fs_name, self.primary_fs_id, dir_path)
+            except CommandFailedError as ce:
+                if ce.exitstatus != errno.EINVAL:
+                    raise RuntimeError(-errno.EINVAL, 'incorrect error code when adding a directory')
+            else:
+                raise RuntimeError(-errno.EINVAL, 'expected directory add to fail')
+
+        # cannot add ancestors or a subtree for an existing directory
+        check_add_command_failure('/')
+        check_add_command_failure('/d1')
+        check_add_command_failure('/d1/d2/d3')
+
+        # obviously, one can add a non-ancestor or non-subtree
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1/d4/')
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.mount_a.run_shell(["rm", "-rf", "d1"])
+
+    def test_cephfs_mirror_blocklist(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+        # add peer
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                         'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+        peers_1 = set(res['peers'])
+
+        # fetch rados address for blacklist check
+        rados_inst = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id)
+
+        # simulate non-responding mirror daemon by sending SIGSTOP
+        pid = self.get_mirror_daemon_pid()
+        log.debug(f'SIGSTOP to cephfs-mirror pid {pid}')
+        self.mount_a.run_shell(['kill', '-SIGSTOP', pid])
+
+        # wait for blocklist timeout -- the manager module would blocklist
+        # the mirror daemon
+        time.sleep(40)
+
+        # wake up the mirror daemon -- at this point, the daemon should know
+        # that it has been blocklisted
+        log.debug('SIGCONT to cephfs-mirror')
+        self.mount_a.run_shell(['kill', '-SIGCONT', pid])
+
+        # check if the rados addr is blocklisted
+        self.assertTrue(self.mds_cluster.is_addr_blocklisted(rados_inst))
+
+        # wait enough so that the mirror daemon restarts blocklisted instances
+        time.sleep(40)
+        rados_inst_new = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id)
+
+        # and we should get a new rados instance
+        self.assertTrue(rados_inst != rados_inst_new)
+
+        # along with peers that were added
+        res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                         'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+        peers_2 = set(res['peers'])
+        self.assertTrue(peers_1, peers_2)
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_stats(self):
+        log.debug('reconfigure client auth caps')
+        self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+                'mds', 'allow rw',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name()))
+
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        # create a bunch of files in a directory to snap
+        self.mount_a.run_shell(["mkdir", "d0"])
+        self.mount_a.create_n_files('d0/file', 50, sync=True)
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # take a snapshot
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
+
+        time.sleep(30)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d0', 'snap0', 1)
+        self.verify_snapshot('d0', 'snap0')
+
+        # some more IO
+        self.mount_a.run_shell(["mkdir", "d0/d00"])
+        self.mount_a.run_shell(["mkdir", "d0/d01"])
+
+        self.mount_a.create_n_files('d0/d00/more_file', 20, sync=True)
+        self.mount_a.create_n_files('d0/d01/some_more_file', 75, sync=True)
+
+        # take another snapshot
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap1"])
+
+        time.sleep(60)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d0', 'snap1', 2)
+        self.verify_snapshot('d0', 'snap1')
+
+        # delete a snapshot
+        self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"])
+
+        time.sleep(10)
+        snap_list = self.mount_b.ls(path='d0/.snap')
+        self.assertTrue('snap0' not in snap_list)
+        self.check_peer_status_deleted_snap(self.primary_fs_name, self.primary_fs_id,
+                                            "client.mirror_remote@ceph", '/d0', 1)
+
+        # rename a snapshot
+        self.mount_a.run_shell(["mv", "d0/.snap/snap1", "d0/.snap/snap2"])
+
+        time.sleep(10)
+        snap_list = self.mount_b.ls(path='d0/.snap')
+        self.assertTrue('snap1' not in snap_list)
+        self.assertTrue('snap2' in snap_list)
+        self.check_peer_status_renamed_snap(self.primary_fs_name, self.primary_fs_id,
+                                            "client.mirror_remote@ceph", '/d0', 1)
+
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_cancel_sync(self):
+        log.debug('reconfigure client auth caps')
+        self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+                'mds', 'allow rw',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name()))
+
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        # create a bunch of files in a directory to snap
+        self.mount_a.run_shell(["mkdir", "d0"])
+        for i in range(8):
+            filename = f'file.{i}'
+            self.mount_a.write_n_mb(os.path.join('d0', filename), 1024)
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # take a snapshot
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
+
+        time.sleep(10)
+        self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
+                                         "client.mirror_remote@ceph", '/d0', 'snap0')
+
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+
+        snap_list = self.mount_b.ls(path='d0/.snap')
+        self.assertTrue('snap0' not in snap_list)
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_restart_sync_on_blocklist(self):
+        log.debug('reconfigure client auth caps')
+        self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+                'mds', 'allow rw',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name()))
+
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        # create a bunch of files in a directory to snap
+        self.mount_a.run_shell(["mkdir", "d0"])
+        for i in range(8):
+            filename = f'file.{i}'
+            self.mount_a.write_n_mb(os.path.join('d0', filename), 1024)
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # fetch rados address for blacklist check
+        rados_inst = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id)
+
+        # take a snapshot
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
+
+        time.sleep(10)
+        self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
+                                         "client.mirror_remote@ceph", '/d0', 'snap0')
+
+        # simulate non-responding mirror daemon by sending SIGSTOP
+        pid = self.get_mirror_daemon_pid()
+        log.debug(f'SIGSTOP to cephfs-mirror pid {pid}')
+        self.mount_a.run_shell(['kill', '-SIGSTOP', pid])
+
+        # wait for blocklist timeout -- the manager module would blocklist
+        # the mirror daemon
+        time.sleep(40)
+
+        # wake up the mirror daemon -- at this point, the daemon should know
+        # that it has been blocklisted
+        log.debug('SIGCONT to cephfs-mirror')
+        self.mount_a.run_shell(['kill', '-SIGCONT', pid])
+
+        # check if the rados addr is blocklisted
+        self.assertTrue(self.mds_cluster.is_addr_blocklisted(rados_inst))
+
+        time.sleep(500)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d0', 'snap0', expected_snap_count=1)
+        self.verify_snapshot('d0', 'snap0')
+
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_failed_sync_with_correction(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # add a non-existent directory for synchronization
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+
+        # wait for mirror daemon to mark it the directory as failed
+        time.sleep(120)
+        self.verify_failed_directory(self.primary_fs_name, self.primary_fs_id,
+                                     "client.mirror_remote@ceph", '/d0')
+
+        # create the directory
+        self.mount_a.run_shell(["mkdir", "d0"])
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
+
+        # wait for correction
+        time.sleep(120)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d0', 'snap0', 1)
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_service_daemon_status(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        time.sleep(30)
+        status = self.get_mirror_daemon_status()
+
+        # assumption for this test: mirroring enabled for a single filesystem w/ single
+        # peer
+
+        # we have not added any directories
+        peer = status['filesystems'][0]['peers'][0]
+        self.assertEquals(status['filesystems'][0]['directory_count'], 0)
+        self.assertEquals(peer['stats']['failure_count'], 0)
+        self.assertEquals(peer['stats']['recovery_count'], 0)
+
+        # add a non-existent directory for synchronization -- check if its reported
+        # in daemon stats
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+
+        time.sleep(120)
+        status = self.get_mirror_daemon_status()
+        # we added one
+        peer = status['filesystems'][0]['peers'][0]
+        self.assertEquals(status['filesystems'][0]['directory_count'], 1)
+        # failure count should be reflected
+        self.assertEquals(peer['stats']['failure_count'], 1)
+        self.assertEquals(peer['stats']['recovery_count'], 0)
+
+        # create the directory, mirror daemon would recover
+        self.mount_a.run_shell(["mkdir", "d0"])
+
+        time.sleep(120)
+        status = self.get_mirror_daemon_status()
+        peer = status['filesystems'][0]['peers'][0]
+        self.assertEquals(status['filesystems'][0]['directory_count'], 1)
+        # failure and recovery count should be reflected
+        self.assertEquals(peer['stats']['failure_count'], 1)
+        self.assertEquals(peer['stats']['recovery_count'], 1)
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_mirroring_init_failure(self):
+        """Test mirror daemon init failure"""
+
+        # disable mgr mirroring plugin as it would try to load dir map on
+        # on mirroring enabled for a filesystem (an throw up erorrs in
+        # the logs)
+        self.disable_mirroring_module()
+
+        # enable mirroring through mon interface -- this should result in the mirror daemon
+        # failing to enable mirroring due to absence of `cephfs_mirorr` index object.
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "enable", self.primary_fs_name)
+
+        with safe_while(sleep=5, tries=10, action='wait for failed state') as proceed:
+            while proceed():
+                try:
+                    # verify via asok
+                    res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                                     'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+                    if not 'state' in res:
+                        return
+                    self.assertTrue(res['state'] == "failed")
+                    return True
+                except:
+                    pass
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "disable", self.primary_fs_name)
+        time.sleep(10)
+        # verify via asok
+        try:
+            self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                       'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError('expected admin socket to be unavailable')
+
+    def test_mirroring_init_failure_with_recovery(self):
+        """Test if the mirror daemon can recover from a init failure"""
+
+        # disable mgr mirroring plugin as it would try to load dir map on
+        # on mirroring enabled for a filesystem (an throw up erorrs in
+        # the logs)
+        self.disable_mirroring_module()
+
+        # enable mirroring through mon interface -- this should result in the mirror daemon
+        # failing to enable mirroring due to absence of `cephfs_mirror` index object.
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "enable", self.primary_fs_name)
+        # need safe_while since non-failed status pops up as mirroring is restarted
+        # internally in mirror daemon.
+        with safe_while(sleep=5, tries=20, action='wait for failed state') as proceed:
+            while proceed():
+                try:
+                    # verify via asok
+                    res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                                     'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+                    if not 'state' in res:
+                        return
+                    self.assertTrue(res['state'] == "failed")
+                    return True
+                except:
+                    pass
+
+        # create the index object and check daemon recovery
+        try:
+            p = self.mount_a.client_remote.run(args=['rados', '-p', self.fs.metadata_pool_name, 'create', 'cephfs_mirror'],
+                                               stdout=StringIO(), stderr=StringIO(), timeout=30,
+                                               check_status=True, label="create index object")
+            p.wait()
+        except CommandFailedError as ce:
+            log.warn(f'mirror daemon command to create mirror index object failed: {ce}')
+            raise
+        time.sleep(30)
+        res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                         'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+        self.assertTrue(res['peers'] == {})
+        self.assertTrue(res['snap_dirs']['dir_count'] == 0)
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "disable", self.primary_fs_name)
+        time.sleep(10)
+        # verify via asok
+        try:
+            self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+                                       'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError('expected admin socket to be unavailable')
+
+    def test_cephfs_mirror_peer_bootstrap(self):
+        """Test importing peer bootstrap token"""
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+        # create a bootstrap token for the peer
+        bootstrap_token = self.bootstrap_peer(self.secondary_fs_name, "client.mirror_peer_bootstrap", "site-remote")
+
+        # import the peer via bootstrap token
+        self.import_peer(self.primary_fs_name, bootstrap_token)
+        time.sleep(10)
+        self.verify_peer_added(self.primary_fs_name, self.primary_fs_id, "client.mirror_peer_bootstrap@site-remote",
+                               self.secondary_fs_name)
+
+        # verify via peer_list interface
+        peer_uuid = self.get_peer_uuid("client.mirror_peer_bootstrap@site-remote")
+        res = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_list", self.primary_fs_name))
+        self.assertTrue(peer_uuid in res)
+        self.assertTrue('mon_host' in res[peer_uuid] and res[peer_uuid]['mon_host'] != '')
+
+        # remove peer
+        self.peer_remove(self.primary_fs_name, self.primary_fs_id, "client.mirror_peer_bootstrap@site-remote")
+        # disable mirroring
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_symlink_sync(self):
+        log.debug('reconfigure client auth caps')
+        self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+                'mds', 'allow rw',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name()))
+
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        # create a bunch of files w/ symbolic links in a directory to snap
+        self.mount_a.run_shell(["mkdir", "d0"])
+        self.mount_a.create_n_files('d0/file', 10, sync=True)
+        self.mount_a.run_shell(["ln", "-s", "./file_0", "d0/sym_0"])
+        self.mount_a.run_shell(["ln", "-s", "./file_1", "d0/sym_1"])
+        self.mount_a.run_shell(["ln", "-s", "./file_2", "d0/sym_2"])
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # take a snapshot
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
+
+        time.sleep(30)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d0', 'snap0', 1)
+        self.verify_snapshot('d0', 'snap0')
+
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_with_parent_snapshot(self):
+        """Test snapshot synchronization with parent directory snapshots"""
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d1/d2/d3"])
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0/d1/d2/d3')
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # take a snapshot
+        self.mount_a.run_shell(["mkdir", "d0/d1/d2/d3/.snap/snap0"])
+
+        time.sleep(30)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d0/d1/d2/d3', 'snap0', 1)
+
+        # create snapshots in parent directories
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap_d0"])
+        self.mount_a.run_shell(["mkdir", "d0/d1/.snap/snap_d1"])
+        self.mount_a.run_shell(["mkdir", "d0/d1/d2/.snap/snap_d2"])
+
+        # try syncing more snapshots
+        self.mount_a.run_shell(["mkdir", "d0/d1/d2/d3/.snap/snap1"])
+        time.sleep(30)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d0/d1/d2/d3', 'snap1', 2)
+
+        self.mount_a.run_shell(["rmdir", "d0/d1/d2/d3/.snap/snap0"])
+        self.mount_a.run_shell(["rmdir", "d0/d1/d2/d3/.snap/snap1"])
+        time.sleep(15)
+        self.check_peer_status_deleted_snap(self.primary_fs_name, self.primary_fs_id,
+                                            "client.mirror_remote@ceph", '/d0/d1/d2/d3', 2)
+
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0/d1/d2/d3')
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_remove_on_stall(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+        # fetch rados address for blacklist check
+        rados_inst = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id)
+
+        # simulate non-responding mirror daemon by sending SIGSTOP
+        pid = self.get_mirror_daemon_pid()
+        log.debug(f'SIGSTOP to cephfs-mirror pid {pid}')
+        self.mount_a.run_shell(['kill', '-SIGSTOP', pid])
+
+        # wait for blocklist timeout -- the manager module would blocklist
+        # the mirror daemon
+        time.sleep(40)
+
+        # make sure the rados addr is blocklisted
+        self.assertTrue(self.mds_cluster.is_addr_blocklisted(rados_inst))
+
+        # now we are sure that there are no "active" mirror daemons -- add a directory path.
+        dir_path_p = "/d0/d1"
+        dir_path = "/d0/d1/d2"
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "add", self.primary_fs_name, dir_path)
+
+        time.sleep(10)
+        # this uses an undocumented interface to get dirpath map state
+        res_json = self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "dirmap", self.primary_fs_name, dir_path)
+        res = json.loads(res_json)
+        # there are no mirror daemons
+        self.assertTrue(res['state'], 'stalled')
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "remove", self.primary_fs_name, dir_path)
+
+        time.sleep(10)
+        try:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "dirmap", self.primary_fs_name, dir_path)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise RuntimeError('invalid errno when checking dirmap status for non-existent directory')
+        else:
+            raise RuntimeError('incorrect errno when checking dirmap state for non-existent directory')
+
+        # adding a parent directory should be allowed
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "add", self.primary_fs_name, dir_path_p)
+
+        time.sleep(10)
+        # however, this directory path should get stalled too
+        res_json = self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "dirmap", self.primary_fs_name, dir_path_p)
+        res = json.loads(res_json)
+        # there are no mirror daemons
+        self.assertTrue(res['state'], 'stalled')
+
+        # wake up the mirror daemon -- at this point, the daemon should know
+        # that it has been blocklisted
+        log.debug('SIGCONT to cephfs-mirror')
+        self.mount_a.run_shell(['kill', '-SIGCONT', pid])
+
+        # wait for restart mirror on blocklist
+        time.sleep(60)
+        res_json = self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "dirmap", self.primary_fs_name, dir_path_p)
+        res = json.loads(res_json)
+        # there are no mirror daemons
+        self.assertTrue(res['state'], 'mapped')
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_incremental_sync(self):
+        """ Test incremental snapshot synchronization (based on mtime differences)."""
+        log.debug('reconfigure client auth caps')
+        self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+            'mds', 'allow rw',
+            'mon', 'allow r',
+            'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name()))
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        repo = 'ceph-qa-suite'
+        repo_dir = 'ceph_repo'
+        repo_path = f'{repo_dir}/{repo}'
+
+        def clone_repo():
+            self.mount_a.run_shell([
+                'git', 'clone', '--branch', 'giant',
+                f'http://github.com/ceph/{repo}', repo_path])
+
+        def exec_git_cmd(cmd_list):
+            self.mount_a.run_shell(['git', '--git-dir', f'{self.mount_a.mountpoint}/{repo_path}/.git', *cmd_list])
+
+        self.mount_a.run_shell(["mkdir", repo_dir])
+        clone_repo()
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{repo_path}')
+        self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_a'])
+
+        # full copy, takes time
+        time.sleep(500)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", f'/{repo_path}', 'snap_a', 1)
+        self.verify_snapshot(repo_path, 'snap_a')
+
+        # create some diff
+        num = random.randint(5, 20)
+        log.debug(f'resetting to HEAD~{num}')
+        exec_git_cmd(["reset", "--hard", f'HEAD~{num}'])
+
+        self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_b'])
+        # incremental copy, should be fast
+        time.sleep(180)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", f'/{repo_path}', 'snap_b', 2)
+        self.verify_snapshot(repo_path, 'snap_b')
+
+        # diff again, this time back to HEAD
+        log.debug('resetting to HEAD')
+        exec_git_cmd(["pull"])
+
+        self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_c'])
+        # incremental copy, should be fast
+        time.sleep(180)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", f'/{repo_path}', 'snap_c', 3)
+        self.verify_snapshot(repo_path, 'snap_c')
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_incremental_sync_with_type_mixup(self):
+        """ Test incremental snapshot synchronization with file type changes.
+
+        The same filename exist as a different type in subsequent snapshot.
+        This verifies if the mirror daemon can identify file type mismatch and
+        sync snapshots.
+
+              \    snap_0       snap_1      snap_2      snap_3
+               \-----------------------------------------------
+        file_x |   reg          sym         dir         reg
+               |
+        file_y |   dir          reg         sym         dir
+               |
+        file_z |   sym          dir         reg         sym
+        """
+        log.debug('reconfigure client auth caps')
+        self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+                'mds', 'allow rw',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name()))
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        typs = deque(['reg', 'dir', 'sym'])
+        def cleanup_and_create_with_type(dirname, fnames):
+            self.mount_a.run_shell_payload(f"rm -rf {dirname}/*")
+            fidx = 0
+            for t in typs:
+                fname = f'{dirname}/{fnames[fidx]}'
+                log.debug(f'file: {fname} type: {t}')
+                if t == 'reg':
+                    self.mount_a.run_shell(["touch", fname])
+                    self.mount_a.write_file(fname, data=fname)
+                elif t == 'dir':
+                    self.mount_a.run_shell(["mkdir", fname])
+                elif t == 'sym':
+                    # verify ELOOP in mirror daemon
+                    self.mount_a.run_shell(["ln", "-s", "..", fname])
+                fidx += 1
+
+        def verify_types(dirname, fnames, snap_name):
+            tidx = 0
+            for fname in fnames:
+                t = self.mount_b.run_shell_payload(f"stat -c %F {dirname}/.snap/{snap_name}/{fname}").stdout.getvalue().strip()
+                if typs[tidx] == 'reg':
+                    self.assertEquals('regular file', t)
+                elif typs[tidx] == 'dir':
+                    self.assertEquals('directory', t)
+                elif typs[tidx] == 'sym':
+                    self.assertEquals('symbolic link', t)
+                tidx += 1
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        self.mount_a.run_shell(["mkdir", "d0"])
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+
+        fnames = ['file_x', 'file_y', 'file_z']
+        turns = 0
+        while turns != len(typs):
+            snapname = f'snap_{turns}'
+            cleanup_and_create_with_type('d0', fnames)
+            self.mount_a.run_shell(['mkdir', f'd0/.snap/{snapname}'])
+            time.sleep(30)
+            self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                                   "client.mirror_remote@ceph", '/d0', snapname, turns+1)
+            verify_types('d0', fnames, snapname)
+            # next type
+            typs.rotate(1)
+            turns += 1
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_sync_with_purged_snapshot(self):
+        """Test snapshot synchronization in midst of snapshot deletes.
+
+        Deleted the previous snapshot when the mirror daemon is figuring out
+        incremental differences between current and previous snaphot. The
+        mirror daemon should identify the purge and switch to using remote
+        comparison to sync the snapshot (in the next iteration of course).
+        """
+
+        log.debug('reconfigure client auth caps')
+        self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+            'mds', 'allow rw',
+            'mon', 'allow r',
+            'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name()))
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        repo = 'ceph-qa-suite'
+        repo_dir = 'ceph_repo'
+        repo_path = f'{repo_dir}/{repo}'
+
+        def clone_repo():
+            self.mount_a.run_shell([
+                'git', 'clone', '--branch', 'giant',
+                f'http://github.com/ceph/{repo}', repo_path])
+
+        def exec_git_cmd(cmd_list):
+            self.mount_a.run_shell(['git', '--git-dir', f'{self.mount_a.mountpoint}/{repo_path}/.git', *cmd_list])
+
+        self.mount_a.run_shell(["mkdir", repo_dir])
+        clone_repo()
+
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{repo_path}')
+        self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_a'])
+
+        # full copy, takes time
+        time.sleep(500)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", f'/{repo_path}', 'snap_a', 1)
+        self.verify_snapshot(repo_path, 'snap_a')
+
+        # create some diff
+        num = random.randint(60, 100)
+        log.debug(f'resetting to HEAD~{num}')
+        exec_git_cmd(["reset", "--hard", f'HEAD~{num}'])
+
+        self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_b'])
+
+        time.sleep(15)
+        self.mount_a.run_shell(['rmdir', f'{repo_path}/.snap/snap_a'])
+
+        # incremental copy but based on remote dir_root
+        time.sleep(300)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", f'/{repo_path}', 'snap_b', 2)
+        self.verify_snapshot(repo_path, 'snap_b')
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_peer_add_primary(self):
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # try adding the primary file system as a peer to secondary file
+        # system
+        try:
+            self.peer_add(self.secondary_fs_name, self.secondary_fs_id, "client.mirror_remote@ceph", self.primary_fs_name)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise RuntimeError('invalid errno when adding a primary file system')
+        else:
+            raise RuntimeError('adding peer should fail')
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+
+    def test_cephfs_mirror_cancel_mirroring_and_readd(self):
+        """
+        Test adding a directory path for synchronization post removal of already added directory paths
+
+        ... to ensure that synchronization of the newly added directory path functions
+        as expected. Note that we schedule three (3) directories for mirroring to ensure
+        that all replayer threads (3 by default) in the mirror daemon are busy.
+        """
+        log.debug('reconfigure client auth caps')
+        self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+                'mds', 'allow rw',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name()))
+
+        log.debug(f'mounting filesystem {self.secondary_fs_name}')
+        self.mount_b.umount_wait()
+        self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+        # create a bunch of files in a directory to snap
+        self.mount_a.run_shell(["mkdir", "d0"])
+        self.mount_a.run_shell(["mkdir", "d1"])
+        self.mount_a.run_shell(["mkdir", "d2"])
+        for i in range(4):
+            filename = f'file.{i}'
+            self.mount_a.write_n_mb(os.path.join('d0', filename), 1024)
+            self.mount_a.write_n_mb(os.path.join('d1', filename), 1024)
+            self.mount_a.write_n_mb(os.path.join('d2', filename), 1024)
+
+        log.debug('enabling mirroring')
+        self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+        log.debug('adding directory paths')
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d2')
+        self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name)
+
+        # take snapshots
+        log.debug('taking snapshots')
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
+        self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"])
+        self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"])
+
+        time.sleep(10)
+        log.debug('checking snap in progress')
+        self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
+                                         "client.mirror_remote@ceph", '/d0', 'snap0')
+        self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
+                                         "client.mirror_remote@ceph", '/d1', 'snap0')
+        self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
+                                         "client.mirror_remote@ceph", '/d2', 'snap0')
+
+        log.debug('removing directories 1')
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        log.debug('removing directories 2')
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
+        log.debug('removing directories 3')
+        self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d2')
+
+        log.debug('removing snapshots')
+        self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"])
+        self.mount_a.run_shell(["rmdir", "d1/.snap/snap0"])
+        self.mount_a.run_shell(["rmdir", "d2/.snap/snap0"])
+
+        for i in range(4):
+            filename = f'file.{i}'
+            log.debug(f'deleting {filename}')
+            self.mount_a.run_shell(["rm", "-f", os.path.join('d0', filename)])
+            self.mount_a.run_shell(["rm", "-f", os.path.join('d1', filename)])
+            self.mount_a.run_shell(["rm", "-f", os.path.join('d2', filename)])
+
+        log.debug('creating new files...')
+        self.mount_a.create_n_files('d0/file', 50, sync=True)
+        self.mount_a.create_n_files('d1/file', 50, sync=True)
+        self.mount_a.create_n_files('d2/file', 50, sync=True)
+
+        log.debug('adding directory paths')
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
+        self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d2')
+
+        log.debug('creating new snapshots...')
+        self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
+        self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"])
+        self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"])
+
+        time.sleep(60)
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d0', 'snap0', 1)
+        self.verify_snapshot('d0', 'snap0')
+
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d1', 'snap0', 1)
+        self.verify_snapshot('d1', 'snap0')
+
+        self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
+                               "client.mirror_remote@ceph", '/d2', 'snap0', 1)
+        self.verify_snapshot('d2', 'snap0')
+
+        self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py
new file mode 100644
index 000000000..0bd8ad621
--- /dev/null
+++ b/qa/tasks/cephfs/test_misc.py
@@ -0,0 +1,416 @@
+from io import StringIO
+
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra.run import CommandFailedError
+from textwrap import dedent
+from threading import Thread
+import errno
+import time
+import json
+import logging
+import os
+
+log = logging.getLogger(__name__)
+
+class TestMisc(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+
+    def test_statfs_on_deleted_fs(self):
+        """
+        That statfs does not cause monitors to SIGSEGV after fs deletion.
+        """
+
+        self.mount_b.umount_wait()
+        self.mount_a.run_shell_payload("stat -f .")
+        self.fs.delete_all_filesystems()
+        # This will hang either way, run in background.
+        p = self.mount_a.run_shell_payload("stat -f .", wait=False, timeout=60, check_status=False)
+        time.sleep(30)
+        self.assertFalse(p.finished)
+        # the process is stuck in uninterruptible sleep, just kill the mount
+        self.mount_a.umount_wait(force=True)
+        p.wait()
+
+    def test_getattr_caps(self):
+        """
+        Check if MDS recognizes the 'mask' parameter of open request.
+        The parameter allows client to request caps when opening file
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client")
+
+        # Enable debug. Client will requests CEPH_CAP_XATTR_SHARED
+        # on lookup/open
+        self.mount_b.umount_wait()
+        self.set_conf('client', 'client debug getattr caps', 'true')
+        self.mount_b.mount_wait()
+
+        # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_*
+        # to mount_a
+        p = self.mount_a.open_background("testfile")
+        self.mount_b.wait_for_visible("testfile")
+
+        # this triggers a lookup request and an open request. The debug
+        # code will check if lookup/open reply contains xattrs
+        self.mount_b.run_shell(["cat", "testfile"])
+
+        self.mount_a.kill_background(p)
+
+    def test_root_rctime(self):
+        """
+        Check that the root inode has a non-default rctime on startup.
+        """
+
+        t = time.time()
+        rctime = self.mount_a.getfattr(".", "ceph.dir.rctime")
+        log.info("rctime = {}".format(rctime))
+        self.assertGreaterEqual(float(rctime), t - 10)
+
+    def test_fs_new(self):
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        data_pool_name = self.fs.get_data_pool_name()
+
+        self.fs.fail()
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
+                                            '--yes-i-really-mean-it')
+
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                            self.fs.metadata_pool_name,
+                                            self.fs.metadata_pool_name,
+                                            '--yes-i-really-really-mean-it')
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                            self.fs.metadata_pool_name,
+                                            '--pg_num_min', str(self.fs.pg_num_min))
+
+        # insert a garbage object
+        self.fs.radosm(["put", "foo", "-"], stdin=StringIO("bar"))
+
+        def get_pool_df(fs, name):
+            try:
+                return fs.get_pool_df(name)['objects'] > 0
+            except RuntimeError:
+                return False
+
+        self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30)
+
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                                self.fs.metadata_pool_name,
+                                                data_pool_name)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.EINVAL)
+        else:
+            raise AssertionError("Expected EINVAL")
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                            self.fs.metadata_pool_name,
+                                            data_pool_name, "--force")
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
+                                            '--yes-i-really-mean-it')
+
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                            self.fs.metadata_pool_name,
+                                            self.fs.metadata_pool_name,
+                                            '--yes-i-really-really-mean-it')
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                            self.fs.metadata_pool_name,
+                                            '--pg_num_min', str(self.fs.pg_num_min))
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                            self.fs.metadata_pool_name,
+                                            data_pool_name)
+
+    def test_cap_revoke_nonresponder(self):
+        """
+        Check that a client is evicted if it has not responded to cap revoke
+        request for configured number of seconds.
+        """
+        session_timeout = self.fs.get_var("session_timeout")
+        eviction_timeout = session_timeout / 2.0
+
+        self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout',
+                          str(eviction_timeout)])
+
+        cap_holder = self.mount_a.open_background()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.suspend_netns()
+
+        try:
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+
+            a = time.time()
+            time.sleep(eviction_timeout)
+            cap_waiter.wait()
+            b = time.time()
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+
+            # check if the cap is transferred before session timeout kicked in.
+            # this is a good enough check to ensure that the client got evicted
+            # by the cap auto evicter rather than transitioning to stale state
+            # and then getting evicted.
+            self.assertLess(cap_waited, session_timeout,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, session_timeout
+                            ))
+
+            self.assertTrue(self.mds_cluster.is_addr_blocklisted(
+                self.mount_a.get_global_addr()))
+            self.mount_a._kill_background(cap_holder)
+        finally:
+            self.mount_a.resume_netns()
+
+    def test_filtered_df(self):
+        pool_name = self.fs.get_data_pool_name()
+        raw_df = self.fs.get_pool_df(pool_name)
+        raw_avail = float(raw_df["max_avail"])
+        out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get',
+                                                  pool_name, 'size',
+                                                  '-f', 'json-pretty')
+        _ = json.loads(out)
+
+        proc = self.mount_a.run_shell(['df', '.'])
+        output = proc.stdout.getvalue()
+        fs_avail = output.split('\n')[1].split()[3]
+        fs_avail = float(fs_avail) * 1024
+
+        ratio = raw_avail / fs_avail
+        assert 0.9 < ratio < 1.1
+
+    def test_dump_inode(self):
+        info = self.fs.mds_asok(['dump', 'inode', '1'])
+        assert(info['path'] == "/")
+
+    def test_dump_inode_hexademical(self):
+        self.mount_a.run_shell(["mkdir", "-p", "foo"])
+        ino = self.mount_a.path_to_ino("foo")
+        assert type(ino) is int
+        info = self.fs.mds_asok(['dump', 'inode', hex(ino)])
+        assert info['path'] == "/foo"
+
+
+    def _test_sync_stuck_for_around_5s(self, dir_path, file_sync=False):
+        self.mount_a.run_shell(["mkdir", dir_path])
+
+        sync_dir_pyscript = dedent("""
+                import os
+
+                path = "{path}"
+                dfd = os.open(path, os.O_DIRECTORY)
+                os.fsync(dfd)
+                os.close(dfd)
+            """.format(path=dir_path))
+
+        # run create/delete directories and test the sync time duration
+        for i in range(300):
+            for j in range(5):
+                self.mount_a.run_shell(["mkdir", os.path.join(dir_path, f"{i}_{j}")])
+            start = time.time()
+            if file_sync:
+                self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
+            else:
+                self.mount_a.run_shell(["sync"])
+            duration = time.time() - start
+            log.info(f"sync mkdir i = {i}, duration = {duration}")
+            self.assertLess(duration, 4)
+
+            for j in range(5):
+                self.mount_a.run_shell(["rm", "-rf", os.path.join(dir_path, f"{i}_{j}")])
+            start = time.time()
+            if file_sync:
+                self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
+            else:
+                self.mount_a.run_shell(["sync"])
+            duration = time.time() - start
+            log.info(f"sync rmdir i = {i}, duration = {duration}")
+            self.assertLess(duration, 4)
+
+        self.mount_a.run_shell(["rm", "-rf", dir_path])
+
+    def test_filesystem_sync_stuck_for_around_5s(self):
+        """
+        To check whether the fsync will be stuck to wait for the mdlog to be
+        flushed for at most 5 seconds.
+        """
+
+        dir_path = "filesystem_sync_do_not_wait_mdlog_testdir"
+        self._test_sync_stuck_for_around_5s(dir_path)
+
+    def test_file_sync_stuck_for_around_5s(self):
+        """
+        To check whether the filesystem sync will be stuck to wait for the
+        mdlog to be flushed for at most 5 seconds.
+        """
+
+        dir_path = "file_sync_do_not_wait_mdlog_testdir"
+        self._test_sync_stuck_for_around_5s(dir_path, True)
+
+    def test_file_filesystem_sync_crash(self):
+        """
+        To check whether the kernel crashes when doing the file/filesystem sync.
+        """
+
+        stop_thread = False
+        dir_path = "file_filesystem_sync_crash_testdir"
+        self.mount_a.run_shell(["mkdir", dir_path])
+
+        def mkdir_rmdir_thread(mount, path):
+            #global stop_thread
+
+            log.info(" mkdir_rmdir_thread starting...")
+            num = 0
+            while not stop_thread:
+                n = num
+                m = num
+                for __ in range(10):
+                    mount.run_shell(["mkdir", os.path.join(path, f"{n}")])
+                    n += 1
+                for __ in range(10):
+                    mount.run_shell(["rm", "-rf", os.path.join(path, f"{m}")])
+                    m += 1
+                num += 10
+            log.info(" mkdir_rmdir_thread stopped")
+
+        def filesystem_sync_thread(mount, path):
+            #global stop_thread
+
+            log.info(" filesystem_sync_thread starting...")
+            while not stop_thread:
+                mount.run_shell(["sync"])
+            log.info(" filesystem_sync_thread stopped")
+
+        def file_sync_thread(mount, path):
+            #global stop_thread
+
+            log.info(" file_sync_thread starting...")
+            pyscript = dedent("""
+                    import os
+
+                    path = "{path}"
+                    dfd = os.open(path, os.O_DIRECTORY)
+                    os.fsync(dfd)
+                    os.close(dfd)
+                """.format(path=path))
+
+            while not stop_thread:
+                mount.run_shell(['python3', '-c', pyscript])
+            log.info(" file_sync_thread stopped")
+
+        td1 = Thread(target=mkdir_rmdir_thread, args=(self.mount_a, dir_path,))
+        td2 = Thread(target=filesystem_sync_thread, args=(self.mount_a, dir_path,))
+        td3 = Thread(target=file_sync_thread, args=(self.mount_a, dir_path,))
+
+        td1.start()
+        td2.start()
+        td3.start()
+        time.sleep(1200) # run 20 minutes
+        stop_thread = True
+        td1.join()
+        td2.join()
+        td3.join()
+        self.mount_a.run_shell(["rm", "-rf", dir_path])
+
+
+class TestCacheDrop(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+
+    def _run_drop_cache_cmd(self, timeout=None):
+        result = None
+        args = ["cache", "drop"]
+        if timeout is not None:
+            args.append(str(timeout))
+        result = self.fs.rank_tell(args)
+        return result
+
+    def _setup(self, max_caps=20, threshold=400):
+        # create some files
+        self.mount_a.create_n_files("dc-dir/dc-file", 1000, sync=True)
+
+        # Reduce this so the MDS doesn't rkcall the maximum for simple tests
+        self.fs.rank_asok(['config', 'set', 'mds_recall_max_caps', str(max_caps)])
+        self.fs.rank_asok(['config', 'set', 'mds_recall_max_decay_threshold', str(threshold)])
+
+    def test_drop_cache_command(self):
+        """
+        Basic test for checking drop cache command.
+        Confirm it halts without a timeout.
+        Note that the cache size post trimming is not checked here.
+        """
+        mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+        self._setup()
+        result = self._run_drop_cache_cmd()
+        self.assertEqual(result['client_recall']['return_code'], 0)
+        self.assertEqual(result['flush_journal']['return_code'], 0)
+        # It should take at least 1 second
+        self.assertGreater(result['duration'], 1)
+        self.assertGreaterEqual(result['trim_cache']['trimmed'], 1000-2*mds_min_caps_per_client)
+
+    def test_drop_cache_command_timeout(self):
+        """
+        Basic test for checking drop cache command.
+        Confirm recall halts early via a timeout.
+        Note that the cache size post trimming is not checked here.
+        """
+        self._setup()
+        result = self._run_drop_cache_cmd(timeout=10)
+        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
+        self.assertEqual(result['flush_journal']['return_code'], 0)
+        self.assertGreater(result['duration'], 10)
+        self.assertGreaterEqual(result['trim_cache']['trimmed'], 100) # we did something, right?
+
+    def test_drop_cache_command_dead_timeout(self):
+        """
+        Check drop cache command with non-responding client using tell
+        interface. Note that the cache size post trimming is not checked
+        here.
+        """
+        self._setup()
+        self.mount_a.suspend_netns()
+        # Note: recall is subject to the timeout. The journal flush will
+        # be delayed due to the client being dead.
+        result = self._run_drop_cache_cmd(timeout=5)
+        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
+        self.assertEqual(result['flush_journal']['return_code'], 0)
+        self.assertGreater(result['duration'], 5)
+        self.assertLess(result['duration'], 120)
+        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
+        # cache now causes the Locker to drive eviction of stale clients (a
+        # stale session will be autoclosed at mdsmap['session_timeout']). The
+        # particular operation causing this is journal flush which causes the
+        # MDS to wait wait for cap revoke.
+        #self.assertEqual(0, result['trim_cache']['trimmed'])
+        self.mount_a.resume_netns()
+
+    def test_drop_cache_command_dead(self):
+        """
+        Check drop cache command with non-responding client using tell
+        interface. Note that the cache size post trimming is not checked
+        here.
+        """
+        self._setup()
+        self.mount_a.suspend_netns()
+        result = self._run_drop_cache_cmd()
+        self.assertEqual(result['client_recall']['return_code'], 0)
+        self.assertEqual(result['flush_journal']['return_code'], 0)
+        self.assertGreater(result['duration'], 5)
+        self.assertLess(result['duration'], 120)
+        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
+        # cache now causes the Locker to drive eviction of stale clients (a
+        # stale session will be autoclosed at mdsmap['session_timeout']). The
+        # particular operation causing this is journal flush which causes the
+        # MDS to wait wait for cap revoke.
+        self.mount_a.resume_netns()
diff --git a/qa/tasks/cephfs/test_multifs_auth.py b/qa/tasks/cephfs/test_multifs_auth.py
new file mode 100644
index 000000000..b247dd8f5
--- /dev/null
+++ b/qa/tasks/cephfs/test_multifs_auth.py
@@ -0,0 +1,313 @@
+"""
+Test for Ceph clusters with multiple FSs.
+"""
+import logging
+
+from os.path import join as os_path_join
+
+# CapsHelper is subclassed from CephFSTestCase
+from tasks.cephfs.caps_helper import CapsHelper
+
+from teuthology.orchestra.run import CommandFailedError
+
+
+log = logging.getLogger(__name__)
+
+
+class TestMultiFS(CapsHelper):
+    client_id = 'testuser'
+    client_name = 'client.' + client_id
+    # one dedicated for each FS
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
+
+    def setUp(self):
+        super(TestMultiFS, self).setUp()
+
+        # we might have it - the client - if the same cluster was used for a
+        # different vstart_runner.py run.
+        self.run_cluster_cmd(f'auth rm {self.client_name}')
+
+        self.fs1 = self.fs
+        # After Octopus is EOL, we can remove this setting:
+        self.fs1.set_allow_multifs()
+        self.fs2 = self.mds_cluster.newfs(name='cephfs2', create=True)
+
+        # we'll reassign caps to client.1 so that it can operate with cephfs2
+        self.run_cluster_cmd(f'auth caps client.{self.mount_b.client_id} mon '
+                             f'"allow r" osd "allow rw '
+                             f'pool={self.fs2.get_data_pool_name()}" mds allow')
+        self.mount_b.remount(cephfs_name=self.fs2.name)
+
+
+class TestMONCaps(TestMultiFS):
+
+    def test_moncap_with_one_fs_names(self):
+        moncap = f'allow r fsname={self.fs1.name}'
+        keyring = self.setup_test_env(moncap)
+
+        self.run_mon_cap_tests(moncap, keyring)
+
+    def test_moncap_with_multiple_fs_names(self):
+        moncap = (f'allow r fsname={self.fs1.name}, '
+                  f'allow r fsname={self.fs2.name}')
+        keyring = self.setup_test_env(moncap)
+
+        self.run_mon_cap_tests(moncap, keyring)
+
+    def test_moncap_with_blanket_allow(self):
+        moncap = 'allow r'
+        keyring = self.setup_test_env(moncap)
+
+        self.run_mon_cap_tests(moncap, keyring)
+
+    def setup_test_env(self, moncap):
+        return self.create_client(self.client_id, moncap)
+
+
+#TODO: add tests for capsecs 'p' and 's'.
+class TestMDSCaps(TestMultiFS):
+    """
+    0. Have 2 FSs on Ceph cluster.
+    1. Create new files on both FSs.
+    2. Create a new client that has authorization for both FSs.
+    3. Remount the current mounts with this new client.
+    4. Test read and write on both FSs.
+    """
+    def test_rw_with_fsname_and_no_path_in_cap(self):
+        perm = 'rw'
+        filepaths, filedata, mounts = self.setup_test_env(perm, True)
+
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_r_with_fsname_and_no_path_in_cap(self):
+        perm = 'r'
+        filepaths, filedata, mounts = self.setup_test_env(perm, True)
+
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_rw_with_fsname_and_path_in_cap(self):
+        perm = 'rw'
+        filepaths, filedata, mounts = self.setup_test_env(perm, True,'dir1')
+
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_r_with_fsname_and_path_in_cap(self):
+        perm = 'r'
+        filepaths, filedata, mounts = self.setup_test_env(perm, True, 'dir1')
+
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    # XXX: this tests the backward compatibility; "allow rw path=<dir1>" is
+    # treated as "allow rw fsname=* path=<dir1>"
+    def test_rw_with_no_fsname_and_path_in_cap(self):
+        perm = 'rw'
+        filepaths, filedata, mounts = self.setup_test_env(perm, False, 'dir1')
+
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    # XXX: this tests the backward compatibility; "allow r path=<dir1>" is
+    # treated as "allow r fsname=* path=<dir1>"
+    def test_r_with_no_fsname_and_path_in_cap(self):
+        perm = 'r'
+        filepaths, filedata, mounts = self.setup_test_env(perm, False, 'dir1')
+
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_rw_with_no_fsname_and_no_path(self):
+        perm = 'rw'
+        filepaths, filedata, mounts = self.setup_test_env(perm)
+
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def test_r_with_no_fsname_and_no_path(self):
+        perm = 'r'
+        filepaths, filedata, mounts = self.setup_test_env(perm)
+
+        self.run_mds_cap_tests(filepaths, filedata, mounts, perm)
+
+    def tearDown(self):
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        super(type(self), self).tearDown()
+
+    def setup_test_env(self, perm, fsname=False, cephfs_mntpt='/'):
+        """
+        Creates the cap string, files on both the FSs and then creates the
+        new client with the cap and remounts both the FSs with newly created
+        client.
+        """
+        filenames = ('file_on_fs1', 'file_on_fs2')
+        filedata = ('some data on first fs', 'some data on second fs')
+        mounts = (self.mount_a, self.mount_b)
+        self.setup_fs_contents(cephfs_mntpt, filenames, filedata)
+
+        keyring_paths = self.create_client_and_keyring_file(perm, fsname,
+                                                            cephfs_mntpt)
+        filepaths = self.remount_with_new_client(cephfs_mntpt, filenames,
+                                                 keyring_paths)
+
+        return filepaths, filedata, mounts
+
+    def generate_caps(self, perm, fsname, cephfs_mntpt):
+        moncap = 'allow r'
+        osdcap = (f'allow {perm} tag cephfs data={self.fs1.name}, '
+                  f'allow {perm} tag cephfs data={self.fs2.name}')
+
+        if fsname:
+            if cephfs_mntpt == '/':
+                mdscap = (f'allow {perm} fsname={self.fs1.name}, '
+                          f'allow {perm} fsname={self.fs2.name}')
+            else:
+                mdscap = (f'allow {perm} fsname={self.fs1.name} '
+                          f'path=/{cephfs_mntpt}, '
+                          f'allow {perm} fsname={self.fs2.name} '
+                          f'path=/{cephfs_mntpt}')
+        else:
+            if cephfs_mntpt == '/':
+                mdscap = f'allow {perm}'
+            else:
+                mdscap = f'allow {perm} path=/{cephfs_mntpt}'
+
+        return moncap, osdcap, mdscap
+
+    def create_client_and_keyring_file(self, perm, fsname, cephfs_mntpt):
+        moncap, osdcap, mdscap = self.generate_caps(perm, fsname,
+                                                    cephfs_mntpt)
+
+        keyring = self.create_client(self.client_id, moncap, osdcap, mdscap)
+        keyring_paths = []
+        for mount_x in (self.mount_a, self.mount_b):
+            keyring_paths.append(self.create_keyring_file(
+                mount_x.client_remote, keyring))
+
+        return keyring_paths
+
+    def setup_fs_contents(self, cephfs_mntpt, filenames, filedata):
+        filepaths = []
+        iter_on = zip((self.mount_a, self.mount_b), filenames, filedata)
+
+        for mount_x, filename, data in iter_on:
+            if cephfs_mntpt != '/' :
+                mount_x.run_shell(args=['mkdir', cephfs_mntpt])
+                filepaths.append(os_path_join(mount_x.hostfs_mntpt,
+                                              cephfs_mntpt, filename))
+            else:
+                filepaths.append(os_path_join(mount_x.hostfs_mntpt, filename))
+
+            mount_x.write_file(filepaths[-1], data)
+
+    def remount_with_new_client(self, cephfs_mntpt, filenames,
+                                           keyring_paths):
+        if isinstance(cephfs_mntpt, str) and cephfs_mntpt != '/' :
+            cephfs_mntpt = '/' + cephfs_mntpt
+
+        self.mount_a.remount(client_id=self.client_id,
+                             client_keyring_path=keyring_paths[0],
+                             client_remote=self.mount_a.client_remote,
+                             cephfs_name=self.fs1.name,
+                             cephfs_mntpt=cephfs_mntpt,
+                             hostfs_mntpt=self.mount_a.hostfs_mntpt,
+                             wait=True)
+        self.mount_b.remount(client_id=self.client_id,
+                             client_keyring_path=keyring_paths[1],
+                             client_remote=self.mount_b.client_remote,
+                             cephfs_name=self.fs2.name,
+                             cephfs_mntpt=cephfs_mntpt,
+                             hostfs_mntpt=self.mount_b.hostfs_mntpt,
+                             wait=True)
+
+        return (os_path_join(self.mount_a.hostfs_mntpt, filenames[0]),
+                os_path_join(self.mount_b.hostfs_mntpt, filenames[1]))
+
+
+class TestClientsWithoutAuth(TestMultiFS):
+
+    def setUp(self):
+        super(TestClientsWithoutAuth, self).setUp()
+
+        # TODO: When MON and OSD caps for a Ceph FS are assigned to a
+        # client but MDS caps are not, mount.ceph prints "permission
+        # denied". But when MON caps are not assigned and MDS and OSD
+        # caps are, mount.ceph prints "no mds server or cluster laggy"
+        # instead of "permission denied".
+        #
+        # Before uncommenting the following line a fix would be required
+        # for latter case to change "no mds server is up or the cluster is
+        #  laggy" to "permission denied".
+        self.kernel_errmsgs = ('permission denied', 'no mds server is up or '
+                               'the cluster is laggy', 'no such file or '
+                               'directory',
+                               'input/output error')
+
+        # TODO: When MON and OSD caps are assigned for a Ceph FS to a
+        # client but MDS caps are not, ceph-fuse prints "operation not
+        # permitted". But when MON caps are not assigned and MDS and OSD
+        # caps are, ceph-fuse prints "no such file or directory" instead
+        # of "operation not permitted".
+        #
+        # Before uncommenting the following line a fix would be required
+        # for the latter case to change "no such file or directory" to
+        # "operation not permitted".
+        #self.assertIn('operation not permitted', retval[2].lower())
+        self.fuse_errmsgs = ('operation not permitted', 'no such file or '
+                             'directory')
+
+        if 'kernel' in str(type(self.mount_a)).lower():
+            self.errmsgs = self.kernel_errmsgs
+        elif 'fuse' in str(type(self.mount_a)).lower():
+            self.errmsgs = self.fuse_errmsgs
+        else:
+            raise RuntimeError('strange, the client was neither based on '
+                               'kernel nor FUSE.')
+
+    def check_that_mount_failed_for_right_reason(self, stderr):
+        stderr = stderr.lower()
+        for errmsg in self.errmsgs:
+            if errmsg in stderr:
+                break
+        else:
+            raise AssertionError('can\'t find expected set of words in the '
+                                 f'stderr\nself.errmsgs - {self.errmsgs}\n'
+                                 f'stderr - {stderr}')
+
+    def test_mount_all_caps_absent(self):
+        # setup part...
+        keyring = self.fs1.authorize(self.client_id, ('/', 'rw'))
+        keyring_path = self.create_keyring_file(self.mount_a.client_remote,
+                                                keyring)
+
+        # mount the FS for which client has no auth...
+        retval = self.mount_a.remount(client_id=self.client_id,
+                                      client_keyring_path=keyring_path,
+                                      cephfs_name=self.fs2.name,
+                                      check_status=False)
+
+        # tests...
+        self.assertIsInstance(retval, tuple)
+        self.assertEqual(len(retval), 3)
+        self.assertIsInstance(retval[0], CommandFailedError)
+        self.check_that_mount_failed_for_right_reason(retval[2])
+
+    def test_mount_mon_and_osd_caps_present_mds_caps_absent(self):
+        # setup part...
+        moncap = f'allow rw fsname={self.fs1.name}, allow rw fsname={self.fs2.name}'
+        mdscap = f'allow rw fsname={self.fs1.name}'
+        osdcap = (f'allow rw tag cephfs data={self.fs1.name}, allow rw tag '
+                  f'cephfs data={self.fs2.name}')
+        keyring = self.create_client(self.client_id, moncap, osdcap, mdscap)
+        keyring_path = self.create_keyring_file(self.mount_a.client_remote,
+                                                keyring)
+
+        # mount the FS for which client has no auth...
+        retval = self.mount_a.remount(client_id=self.client_id,
+                                      client_keyring_path=keyring_path,
+                                      cephfs_name=self.fs2.name,
+                                      check_status=False)
+
+        # tests...
+        self.assertIsInstance(retval, tuple)
+        self.assertEqual(len(retval), 3)
+        self.assertIsInstance(retval[0], CommandFailedError)
+        self.check_that_mount_failed_for_right_reason(retval[2])
diff --git a/qa/tasks/cephfs/test_multimds_misc.py b/qa/tasks/cephfs/test_multimds_misc.py
new file mode 100644
index 000000000..3c464e91d
--- /dev/null
+++ b/qa/tasks/cephfs/test_multimds_misc.py
@@ -0,0 +1,223 @@
+import logging
+import errno
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.contextutil import safe_while
+from teuthology.orchestra.run import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+class TestScrub2(CephFSTestCase):
+    MDSS_REQUIRED = 3
+    CLIENTS_REQUIRED = 1
+
+    def _check_scrub_status(self, result=None, reverse=False):
+        self.assertEqual(self.fs.wait_until_scrub_complete(result=result, rank=1,
+                                                           sleep=5, timeout=30,
+                                                           reverse=reverse), True)
+        self.assertEqual(self.fs.wait_until_scrub_complete(result=result, rank=2,
+                                                           sleep=5, timeout=30,
+                                                           reverse=reverse), True)
+        self.assertEqual(self.fs.wait_until_scrub_complete(result=result, rank=0,
+                                                           sleep=5, timeout=30,
+                                                           reverse=reverse), True)
+
+    def _check_task_status_na(self, timo=120):
+        """ check absence of scrub status in ceph status """
+        with safe_while(sleep=1, tries=120, action='wait for task status') as proceed:
+            while proceed():
+                active = self.fs.get_active_names()
+                log.debug("current active={0}".format(active))
+                task_status = self.fs.get_task_status("scrub status")
+                if not active[0] in task_status:
+                    return True
+
+    def _check_task_status(self, expected_status, timo=120):
+        """ check scrub status for current active mds in ceph status """
+        with safe_while(sleep=1, tries=120, action='wait for task status') as proceed:
+            while proceed():
+                active = self.fs.get_active_names()
+                log.debug("current active={0}".format(active))
+                task_status = self.fs.get_task_status("scrub status")
+                try:
+                    if task_status[active[0]].startswith(expected_status):
+                        return True
+                except KeyError:
+                    pass
+
+    def _find_path_inos(self, root_path):
+        inos = []
+        p = self.mount_a.run_shell(["find", root_path])
+        paths = p.stdout.getvalue().strip().split()
+        for path in paths:
+            inos.append(self.mount_a.path_to_ino(path))
+        return inos
+
+    def _setup_subtrees(self):
+        self.fs.set_max_mds(3)
+        self.fs.wait_for_daemons()
+        status = self.fs.status()
+
+        path = 'd1/d2/d3/d4/d5/d6/d7/d8'
+        self.mount_a.run_shell(['mkdir', '-p', path])
+        self.mount_a.run_shell(['sync', path])
+
+        self.mount_a.setfattr("d1/d2", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d1/d2/d3/d4", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("d1/d2/d3/d4/d5/d6", "ceph.dir.pin", "2")
+        
+        self._wait_subtrees([('/d1/d2', 0), ('/d1/d2/d3/d4', 1)], status, 0)
+        self._wait_subtrees([('/d1/d2/d3/d4', 1), ('/d1/d2/d3/d4/d5/d6', 2)], status, 1)
+        self._wait_subtrees([('/d1/d2/d3/d4', 1), ('/d1/d2/d3/d4/d5/d6', 2)], status, 2)
+
+        for rank in range(3):
+            self.fs.rank_tell(["flush", "journal"], rank)
+
+    def test_apply_tag(self):
+        self._setup_subtrees()
+        inos = self._find_path_inos('d1/d2/d3/')
+
+        tag = "tag123"
+        out_json = self.fs.rank_tell(["tag", "path", "/d1/d2/d3", tag], 0)
+        self.assertNotEqual(out_json, None)
+        self.assertEqual(out_json["return_code"], 0)
+        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
+
+        def assertTagged(ino):
+            file_obj_name = "{0:x}.00000000".format(ino)
+            self.fs.radosm(["getxattr", file_obj_name, "scrub_tag"])
+
+        for ino in inos:
+            assertTagged(ino)
+
+    def test_scrub_backtrace(self):
+        self._setup_subtrees()
+        inos = self._find_path_inos('d1/d2/d3/')
+
+        for ino in inos:
+            file_obj_name = "{0:x}.00000000".format(ino)
+            self.fs.radosm(["rmxattr", file_obj_name, "parent"])
+
+        out_json = self.fs.run_scrub(["start", "/d1/d2/d3", "recursive,force"], 0)
+        self.assertNotEqual(out_json, None)
+        self.assertEqual(out_json["return_code"], 0)
+        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
+
+        def _check_damage(mds_rank, inos):
+            all_damage = self.fs.rank_tell(["damage", "ls"], mds_rank)
+            damage = [d for d in all_damage if d['ino'] in inos and d['damage_type'] == "backtrace"]
+            return len(damage) >= len(inos)
+
+        self.assertTrue(_check_damage(0, inos[0:2]))
+        self.assertTrue(_check_damage(1, inos[2:4]))
+        self.assertTrue(_check_damage(2, inos[4:6]))
+
+    def test_scrub_non_mds0(self):
+        self._setup_subtrees()
+
+        def expect_exdev(cmd, mds):
+            try:
+                self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(mds), *cmd)
+            except CommandFailedError as e:
+                if e.exitstatus == errno.EXDEV:
+                    pass
+                else:
+                    raise
+            else:
+                raise RuntimeError("expected failure")
+
+        rank1 = self.fs.get_rank(rank=1)
+        expect_exdev(["scrub", "start", "/d1/d2/d3"], rank1["name"])
+        expect_exdev(["scrub", "abort"], rank1["name"])
+        expect_exdev(["scrub", "pause"], rank1["name"])
+        expect_exdev(["scrub", "resume"], rank1["name"])
+
+    def test_scrub_abort_mds0(self):
+        self._setup_subtrees()
+
+        inos = self._find_path_inos('d1/d2/d3/')
+
+        for ino in inos:
+            file_obj_name = "{0:x}.00000000".format(ino)
+            self.fs.radosm(["rmxattr", file_obj_name, "parent"])
+
+        out_json = self.fs.run_scrub(["start", "/d1/d2/d3", "recursive,force"], 0)
+        self.assertNotEqual(out_json, None)
+        
+        res = self.fs.run_scrub(["abort"])
+        self.assertEqual(res['return_code'], 0)
+
+        # Abort and verify in both mdss. We also check the status in rank 0 mds because
+        # it is supposed to gather the scrub status from other mdss.
+        self._check_scrub_status()
+
+        # sleep enough to fetch updated task status
+        checked = self._check_task_status_na()
+        self.assertTrue(checked)
+
+    def test_scrub_pause_and_resume_mds0(self):
+        self._setup_subtrees()
+
+        inos = self._find_path_inos('d1/d2/d3/')
+
+        for ino in inos:
+            file_obj_name = "{0:x}.00000000".format(ino)
+            self.fs.radosm(["rmxattr", file_obj_name, "parent"])
+
+        out_json = self.fs.run_scrub(["start", "/d1/d2/d3", "recursive,force"], 0)
+        self.assertNotEqual(out_json, None)
+
+        res = self.fs.run_scrub(["pause"])
+        self.assertEqual(res['return_code'], 0)
+
+        self._check_scrub_status(result="PAUSED")
+
+        checked = self._check_task_status("paused")
+        self.assertTrue(checked)
+
+        # resume and verify
+        res = self.fs.run_scrub(["resume"])
+        self.assertEqual(res['return_code'], 0)
+        
+        self._check_scrub_status(result="PAUSED", reverse=True)
+
+        checked = self._check_task_status_na()
+        self.assertTrue(checked)
+
+    def test_scrub_pause_and_resume_with_abort_mds0(self):
+        self._setup_subtrees()
+
+        inos = self._find_path_inos('d1/d2/d3/')
+
+        for ino in inos:
+            file_obj_name = "{0:x}.00000000".format(ino)
+            self.fs.radosm(["rmxattr", file_obj_name, "parent"])
+
+        out_json = self.fs.run_scrub(["start", "/d1/d2/d3", "recursive,force"], 0)
+        self.assertNotEqual(out_json, None)
+
+        res = self.fs.run_scrub(["pause"])
+        self.assertEqual(res['return_code'], 0)
+
+        self._check_scrub_status(result="PAUSED")
+
+        checked = self._check_task_status("paused")
+        self.assertTrue(checked)
+
+        res = self.fs.run_scrub(["abort"])
+        self.assertEqual(res['return_code'], 0)
+
+        self._check_scrub_status(result="PAUSED")
+        self._check_scrub_status(result="0 inodes")
+
+        # scrub status should still be paused...
+        checked = self._check_task_status("paused")
+        self.assertTrue(checked)
+
+        # resume and verify
+        res = self.fs.run_scrub(["resume"])
+        self.assertEqual(res['return_code'], 0)
+
+        self._check_scrub_status(result="PAUSED", reverse=True)
+
+        checked = self._check_task_status_na()
+        self.assertTrue(checked)
diff --git a/qa/tasks/cephfs/test_newops.py b/qa/tasks/cephfs/test_newops.py
new file mode 100644
index 000000000..4c34dabdd
--- /dev/null
+++ b/qa/tasks/cephfs/test_newops.py
@@ -0,0 +1,25 @@
+import logging
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.kernel_mount import KernelMount
+
+log = logging.getLogger(__name__)
+
+class TestNewOps(CephFSTestCase):
+    def test_newops_getvxattr(self):
+        """
+        For nautilus it will crash the MDSs when receive unknown OPs, as a workaround
+        the clients should avoid sending them to nautilus
+        """
+        if isinstance(self.mount_a, FuseMount):
+            log.info('client is fuse mounted')
+        elif isinstance(self.mount_a, KernelMount):
+            log.info('client is kernel mounted')
+            self.skipTest("Currently kclient hasn't fixed new ops issue yet.")
+
+        log.info("Test for new getvxattr op...")
+        self.mount_a.run_shell(["mkdir", "newop_getvxattr_dir"])
+
+        # to test whether will nautilus crash the MDSs
+        self.mount_a.getfattr("./newop_getvxattr_dir", "ceph.dir.pin.random")
+        log.info("Test for new getvxattr op succeeds")
diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py
new file mode 100644
index 000000000..47b3e63a6
--- /dev/null
+++ b/qa/tasks/cephfs/test_nfs.py
@@ -0,0 +1,727 @@
+# NOTE: these tests are not yet compatible with vstart_runner.py.
+import errno
+import json
+import time
+import logging
+from io import BytesIO
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+from teuthology import contextutil
+from teuthology.exceptions import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+NFS_POOL_NAME = '.nfs'  # should match mgr_module.py
+
+# TODO Add test for cluster update when ganesha can be deployed on multiple ports.
+class TestNFS(MgrTestCase):
+    def _cmd(self, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd(*args)
+
+    def _nfs_cmd(self, *args):
+        return self._cmd("nfs", *args)
+
+    def _orch_cmd(self, *args):
+        return self._cmd("orch", *args)
+
+    def _sys_cmd(self, cmd):
+        ret = self.ctx.cluster.run(args=cmd, check_status=False, stdout=BytesIO(), stderr=BytesIO())
+        stdout = ret[0].stdout
+        if stdout:
+            return stdout.getvalue()
+
+    def setUp(self):
+        super(TestNFS, self).setUp()
+        self._load_module('nfs')
+        self.cluster_id = "test"
+        self.export_type = "cephfs"
+        self.pseudo_path = "/cephfs"
+        self.path = "/"
+        self.fs_name = "nfs-cephfs"
+        self.expected_name = "nfs.test"
+        self.sample_export = {
+         "export_id": 1,
+         "path": self.path,
+         "cluster_id": self.cluster_id,
+         "pseudo": self.pseudo_path,
+         "access_type": "RW",
+         "squash": "none",
+         "security_label": True,
+         "protocols": [
+           4
+         ],
+         "transports": [
+           "TCP"
+         ],
+         "fsal": {
+           "name": "CEPH",
+           "user_id": "nfs.test.1",
+           "fs_name": self.fs_name,
+         },
+         "clients": []
+        }
+
+    def _check_nfs_server_status(self):
+        res = self._sys_cmd(['sudo', 'systemctl', 'status', 'nfs-server'])
+        if isinstance(res, bytes) and b'Active: active' in res:
+            self._disable_nfs()
+
+    def _disable_nfs(self):
+        log.info("Disabling NFS")
+        self._sys_cmd(['sudo', 'systemctl', 'disable', 'nfs-server', '--now'])
+
+    def _fetch_nfs_daemons_details(self, enable_json=False):
+        args = ('ps', f'--service_name={self.expected_name}')
+        if enable_json:
+            args = (*args, '--format=json')
+        return self._orch_cmd(*args)
+
+    def _check_nfs_cluster_event(self, expected_event):
+        '''
+        Check whether an event occured during the lifetime of the NFS service
+        :param expected_event: event that was expected to occur
+        '''
+        event_occurred = False
+        # Wait few seconds for NFS daemons' status to be updated
+        with contextutil.safe_while(sleep=10, tries=12, _raise=False) as proceed:
+            while not event_occurred and proceed():
+                daemons_details = json.loads(
+                    self._fetch_nfs_daemons_details(enable_json=True))
+                log.info('daemons details %s', daemons_details)
+                for event in daemons_details[0]['events']:
+                    log.info('daemon event %s', event)
+                    if expected_event in event:
+                        event_occurred = True
+                        break
+        return event_occurred
+
+    def _check_nfs_cluster_status(self, expected_status, fail_msg):
+        '''
+        Check the current status of the NFS service
+        :param expected_status: Status to be verified
+        :param fail_msg: Message to be printed if test failed
+        '''
+        # Wait for two minutes as ganesha daemon takes some time to be deleted/created
+        wait_time = 10
+        while wait_time <= 120:
+            time.sleep(wait_time)
+            if expected_status in self._fetch_nfs_daemons_details():
+                return
+            wait_time += 10
+        self.fail(fail_msg)
+
+    def _check_auth_ls(self, export_id=1, check_in=False):
+        '''
+        Tests export user id creation or deletion.
+        :param export_id: Denotes export number
+        :param check_in: Check specified export id
+        '''
+        output = self._cmd('auth', 'ls')
+        client_id = f'client.nfs.{self.cluster_id}'
+        if check_in:
+            self.assertIn(f'{client_id}.{export_id}', output)
+        else:
+            self.assertNotIn(f'{client_id}.{export_id}', output)
+
+    def _test_idempotency(self, cmd_func, cmd_args):
+        '''
+        Test idempotency of commands. It first runs the TestNFS test method
+        for a command and then checks the result of command run again. TestNFS
+        test method has required checks to verify that command works.
+        :param cmd_func: TestNFS method
+        :param cmd_args: nfs command arguments to be run
+        '''
+        cmd_func()
+        ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(*cmd_args)
+        if ret != 0:
+            self.fail("Idempotency test failed")
+
+    def _test_create_cluster(self):
+        '''
+        Test single nfs cluster deployment.
+        '''
+        # Disable any running nfs ganesha daemon
+        self._check_nfs_server_status()
+        self._nfs_cmd('cluster', 'create', self.cluster_id)
+        # Check for expected status and daemon name (nfs.<cluster_id>)
+        self._check_nfs_cluster_status('running', 'NFS Ganesha cluster deployment failed')
+
+    def _test_delete_cluster(self):
+        '''
+        Test deletion of a single nfs cluster.
+        '''
+        self._nfs_cmd('cluster', 'rm', self.cluster_id)
+        self._check_nfs_cluster_status('No daemons reported',
+                                       'NFS Ganesha cluster could not be deleted')
+
+    def _test_list_cluster(self, empty=False):
+        '''
+        Test listing of deployed nfs clusters. If nfs cluster is deployed then
+        it checks for expected cluster id. Otherwise checks nothing is listed.
+        :param empty: If true it denotes no cluster is deployed.
+        '''
+        if empty:
+            cluster_id = ''
+        else:
+            cluster_id = self.cluster_id
+        nfs_output = self._nfs_cmd('cluster', 'ls')
+        self.assertEqual(cluster_id, nfs_output.strip())
+
+    def _create_export(self, export_id, create_fs=False, extra_cmd=None):
+        '''
+        Test creation of a single export.
+        :param export_id: Denotes export number
+        :param create_fs: If false filesytem exists. Otherwise create it.
+        :param extra_cmd: List of extra arguments for creating export.
+        '''
+        if create_fs:
+            self._cmd('fs', 'volume', 'create', self.fs_name)
+            with contextutil.safe_while(sleep=5, tries=30) as proceed:
+                while proceed():
+                    output = self._cmd(
+                        'orch', 'ls', '-f', 'json',
+                        '--service-name', f'mds.{self.fs_name}'
+                    )
+                    j = json.loads(output)
+                    if j[0]['status']['running']:
+                        break
+        export_cmd = ['nfs', 'export', 'create', 'cephfs',
+                      '--fsname', self.fs_name, '--cluster-id', self.cluster_id]
+        if isinstance(extra_cmd, list):
+            export_cmd.extend(extra_cmd)
+        else:
+            export_cmd.extend(['--pseudo-path', self.pseudo_path])
+        # Runs the nfs export create command
+        self._cmd(*export_cmd)
+        # Check if user id for export is created
+        self._check_auth_ls(export_id, check_in=True)
+        res = self._sys_cmd(['rados', '-p', NFS_POOL_NAME, '-N', self.cluster_id, 'get',
+                             f'export-{export_id}', '-'])
+        # Check if export object is created
+        if res == b'':
+            self.fail("Export cannot be created")
+
+    def _create_default_export(self):
+        '''
+        Deploy a single nfs cluster and create export with default options.
+        '''
+        self._test_create_cluster()
+        self._create_export(export_id='1', create_fs=True)
+
+    def _delete_export(self):
+        '''
+        Delete an export.
+        '''
+        self._nfs_cmd('export', 'rm', self.cluster_id, self.pseudo_path)
+        self._check_auth_ls()
+
+    def _test_list_export(self):
+        '''
+        Test listing of created exports.
+        '''
+        nfs_output = json.loads(self._nfs_cmd('export', 'ls', self.cluster_id))
+        self.assertIn(self.pseudo_path, nfs_output)
+
+    def _test_list_detailed(self, sub_vol_path):
+        '''
+        Test listing of created exports with detailed option.
+        :param sub_vol_path: Denotes path of subvolume
+        '''
+        nfs_output = json.loads(self._nfs_cmd('export', 'ls', self.cluster_id, '--detailed'))
+        # Export-1 with default values (access type = rw and path = '\')
+        self.assertDictEqual(self.sample_export, nfs_output[0])
+        # Export-2 with r only
+        self.sample_export['export_id'] = 2
+        self.sample_export['pseudo'] = self.pseudo_path + '1'
+        self.sample_export['access_type'] = 'RO'
+        self.sample_export['fsal']['user_id'] = f'{self.expected_name}.2'
+        self.assertDictEqual(self.sample_export, nfs_output[1])
+        # Export-3 for subvolume with r only
+        self.sample_export['export_id'] = 3
+        self.sample_export['path'] = sub_vol_path
+        self.sample_export['pseudo'] = self.pseudo_path + '2'
+        self.sample_export['fsal']['user_id'] = f'{self.expected_name}.3'
+        self.assertDictEqual(self.sample_export, nfs_output[2])
+        # Export-4 for subvolume
+        self.sample_export['export_id'] = 4
+        self.sample_export['pseudo'] = self.pseudo_path + '3'
+        self.sample_export['access_type'] = 'RW'
+        self.sample_export['fsal']['user_id'] = f'{self.expected_name}.4'
+        self.assertDictEqual(self.sample_export, nfs_output[3])
+
+    def _get_export(self):
+        '''
+        Returns export block in json format
+        '''
+        return json.loads(self._nfs_cmd('export', 'info', self.cluster_id, self.pseudo_path))
+
+    def _test_get_export(self):
+        '''
+        Test fetching of created export.
+        '''
+        nfs_output = self._get_export()
+        self.assertDictEqual(self.sample_export, nfs_output)
+
+    def _check_export_obj_deleted(self, conf_obj=False):
+        '''
+        Test if export or config object are deleted successfully.
+        :param conf_obj: It denotes config object needs to be checked
+        '''
+        rados_obj_ls = self._sys_cmd(['rados', '-p', NFS_POOL_NAME, '-N', self.cluster_id, 'ls'])
+
+        if b'export-' in rados_obj_ls or (conf_obj and b'conf-nfs' in rados_obj_ls):
+            self.fail("Delete export failed")
+
+    def _get_port_ip_info(self):
+        '''
+        Return port and ip for a cluster
+        '''
+        #{'test': {'backend': [{'hostname': 'smithi068', 'ip': '172.21.15.68', 'port': 2049}]}}
+        info_output = json.loads(self._nfs_cmd('cluster', 'info', self.cluster_id))['test']['backend'][0]
+        return info_output["port"], info_output["ip"]
+
+    def _test_mnt(self, pseudo_path, port, ip, check=True):
+        '''
+        Test mounting of created exports
+        :param pseudo_path: It is the pseudo root name
+        :param port: Port of deployed nfs cluster
+        :param ip: IP of deployed nfs cluster
+        :param check: It denotes if i/o testing needs to be done
+        '''
+        tries = 3
+        while True:
+            try:
+                self.ctx.cluster.run(
+                    args=['sudo', 'mount', '-t', 'nfs', '-o', f'port={port}',
+                          f'{ip}:{pseudo_path}', '/mnt'])
+                break
+            except CommandFailedError as e:
+                if tries:
+                    tries -= 1
+                    time.sleep(2)
+                    continue
+                # Check if mount failed only when non existing pseudo path is passed
+                if not check and e.exitstatus == 32:
+                    return
+                raise
+
+        self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt'])
+
+        try:
+            self.ctx.cluster.run(args=['touch', '/mnt/test'])
+            out_mnt = self._sys_cmd(['ls', '/mnt'])
+            self.assertEqual(out_mnt,  b'test\n')
+        finally:
+            self.ctx.cluster.run(args=['sudo', 'umount', '/mnt'])
+
+    def _write_to_read_only_export(self, pseudo_path, port, ip):
+        '''
+        Check if write to read only export fails
+        '''
+        try:
+            self._test_mnt(pseudo_path, port, ip)
+        except CommandFailedError as e:
+            # Write to cephfs export should fail for test to pass
+            self.assertEqual(
+                e.exitstatus, errno.EPERM,
+                'invalid error code on trying to write to read-only export')
+        else:
+            self.fail('expected write to a read-only export to fail')
+
+    def test_create_and_delete_cluster(self):
+        '''
+        Test successful creation and deletion of the nfs cluster.
+        '''
+        self._test_create_cluster()
+        self._test_list_cluster()
+        self._test_delete_cluster()
+        # List clusters again to ensure no cluster is shown
+        self._test_list_cluster(empty=True)
+
+    def test_create_delete_cluster_idempotency(self):
+        '''
+        Test idempotency of cluster create and delete commands.
+        '''
+        self._test_idempotency(self._test_create_cluster, ['nfs', 'cluster', 'create', self.cluster_id])
+        self._test_idempotency(self._test_delete_cluster, ['nfs', 'cluster', 'rm', self.cluster_id])
+
+    def test_create_cluster_with_invalid_cluster_id(self):
+        '''
+        Test nfs cluster deployment failure with invalid cluster id.
+        '''
+        try:
+            invalid_cluster_id = '/cluster_test'  # Only [A-Za-z0-9-_.] chars are valid
+            self._nfs_cmd('cluster', 'create', invalid_cluster_id)
+            self.fail(f"Cluster successfully created with invalid cluster id {invalid_cluster_id}")
+        except CommandFailedError as e:
+            # Command should fail for test to pass
+            if e.exitstatus != errno.EINVAL:
+                raise
+
+    def test_create_and_delete_export(self):
+        '''
+        Test successful creation and deletion of the cephfs export.
+        '''
+        self._create_default_export()
+        self._test_get_export()
+        port, ip = self._get_port_ip_info()
+        self._test_mnt(self.pseudo_path, port, ip)
+        self._delete_export()
+        # Check if rados export object is deleted
+        self._check_export_obj_deleted()
+        self._test_mnt(self.pseudo_path, port, ip, False)
+        self._test_delete_cluster()
+
+    def test_create_delete_export_idempotency(self):
+        '''
+        Test idempotency of export create and delete commands.
+        '''
+        self._test_idempotency(self._create_default_export, [
+            'nfs', 'export', 'create', 'cephfs',
+            '--fsname', self.fs_name, '--cluster-id', self.cluster_id,
+            '--pseudo-path', self.pseudo_path])
+        self._test_idempotency(self._delete_export, ['nfs', 'export', 'rm', self.cluster_id,
+                                                     self.pseudo_path])
+        self._test_delete_cluster()
+
+    def test_create_multiple_exports(self):
+        '''
+        Test creating multiple exports with different access type and path.
+        '''
+        # Export-1 with default values (access type = rw and path = '\')
+        self._create_default_export()
+        # Export-2 with r only
+        self._create_export(export_id='2',
+                            extra_cmd=['--pseudo-path', self.pseudo_path+'1', '--readonly'])
+        # Export-3 for subvolume with r only
+        self._cmd('fs', 'subvolume', 'create', self.fs_name, 'sub_vol')
+        fs_path = self._cmd('fs', 'subvolume', 'getpath', self.fs_name, 'sub_vol').strip()
+        self._create_export(export_id='3',
+                            extra_cmd=['--pseudo-path', self.pseudo_path+'2', '--readonly',
+                                       '--path', fs_path])
+        # Export-4 for subvolume
+        self._create_export(export_id='4',
+                            extra_cmd=['--pseudo-path', self.pseudo_path+'3',
+                                       '--path', fs_path])
+        # Check if exports gets listed
+        self._test_list_detailed(fs_path)
+        self._test_delete_cluster()
+        # Check if rados ganesha conf object is deleted
+        self._check_export_obj_deleted(conf_obj=True)
+        self._check_auth_ls()
+
+    def test_exports_on_mgr_restart(self):
+        '''
+        Test export availability on restarting mgr.
+        '''
+        self._create_default_export()
+        # unload and load module will restart the mgr
+        self._unload_module("cephadm")
+        self._load_module("cephadm")
+        self._orch_cmd("set", "backend", "cephadm")
+        # Check if ganesha daemon is running
+        self._check_nfs_cluster_status('running', 'Failed to redeploy NFS Ganesha cluster')
+        # Checks if created export is listed
+        self._test_list_export()
+        port, ip = self._get_port_ip_info()
+        self._test_mnt(self.pseudo_path, port, ip)
+        self._delete_export()
+        self._test_delete_cluster()
+
+    def test_export_create_with_non_existing_fsname(self):
+        '''
+        Test creating export with non-existing filesystem.
+        '''
+        try:
+            fs_name = 'nfs-test'
+            self._test_create_cluster()
+            self._nfs_cmd('export', 'create', 'cephfs',
+                          '--fsname', fs_name, '--cluster-id', self.cluster_id,
+                          '--pseudo-path', self.pseudo_path)
+            self.fail(f"Export created with non-existing filesystem {fs_name}")
+        except CommandFailedError as e:
+            # Command should fail for test to pass
+            if e.exitstatus != errno.ENOENT:
+                raise
+        finally:
+            self._test_delete_cluster()
+
+    def test_export_create_with_non_existing_clusterid(self):
+        '''
+        Test creating cephfs export with non-existing nfs cluster.
+        '''
+        try:
+            cluster_id = 'invalidtest'
+            self._nfs_cmd('export', 'create', 'cephfs', '--fsname', self.fs_name,
+                          '--cluster-id', cluster_id, '--pseudo-path', self.pseudo_path)
+            self.fail(f"Export created with non-existing cluster id {cluster_id}")
+        except CommandFailedError as e:
+            # Command should fail for test to pass
+            if e.exitstatus != errno.ENOENT:
+                raise
+
+    def test_export_create_with_relative_pseudo_path_and_root_directory(self):
+        '''
+        Test creating cephfs export with relative or '/' pseudo path.
+        '''
+        def check_pseudo_path(pseudo_path):
+            try:
+                self._nfs_cmd('export', 'create', 'cephfs', '--fsname', self.fs_name,
+                              '--cluster-id', self.cluster_id,
+                              '--pseudo-path', pseudo_path)
+                self.fail(f"Export created for {pseudo_path}")
+            except CommandFailedError as e:
+                # Command should fail for test to pass
+                if e.exitstatus != errno.EINVAL:
+                    raise
+
+        self._test_create_cluster()
+        self._cmd('fs', 'volume', 'create', self.fs_name)
+        check_pseudo_path('invalidpath')
+        check_pseudo_path('/')
+        check_pseudo_path('//')
+        self._cmd('fs', 'volume', 'rm', self.fs_name, '--yes-i-really-mean-it')
+        self._test_delete_cluster()
+
+    def test_write_to_read_only_export(self):
+        '''
+        Test write to readonly export.
+        '''
+        self._test_create_cluster()
+        self._create_export(export_id='1', create_fs=True,
+                            extra_cmd=['--pseudo-path', self.pseudo_path, '--readonly'])
+        port, ip = self._get_port_ip_info()
+        self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
+        self._write_to_read_only_export(self.pseudo_path, port, ip)
+        self._test_delete_cluster()
+
+    def test_cluster_info(self):
+        '''
+        Test cluster info outputs correct ip and hostname
+        '''
+        self._test_create_cluster()
+        info_output = json.loads(self._nfs_cmd('cluster', 'info', self.cluster_id))
+        print(f'info {info_output}')
+        info_ip = info_output[self.cluster_id].get('backend', [])[0].pop("ip")
+        host_details = {
+            self.cluster_id: {
+                'backend': [
+                    {
+                        "hostname": self._sys_cmd(['hostname']).decode("utf-8").strip(),
+                        "port": 2049
+                    }
+                ],
+                "virtual_ip": None,
+            }
+        }
+        host_ip = self._sys_cmd(['hostname', '-I']).decode("utf-8").split()
+        print(f'host_ip is {host_ip}, info_ip is {info_ip}')
+        self.assertDictEqual(info_output, host_details)
+        self.assertTrue(info_ip in host_ip)
+        self._test_delete_cluster()
+
+    def test_cluster_set_reset_user_config(self):
+        '''
+        Test cluster is created using user config and reverts back to default
+        config on reset.
+        '''
+        self._test_create_cluster()
+
+        pool = NFS_POOL_NAME
+        user_id = 'test'
+        fs_name = 'user_test_fs'
+        pseudo_path = '/ceph'
+        self._cmd('fs', 'volume', 'create', fs_name)
+        time.sleep(20)
+        key = self._cmd('auth', 'get-or-create-key', f'client.{user_id}', 'mon',
+            'allow r', 'osd',
+            f'allow rw pool={pool} namespace={self.cluster_id}, allow rw tag cephfs data={fs_name}',
+            'mds', f'allow rw path={self.path}').strip()
+        config = f""" LOG {{
+        Default_log_level = FULL_DEBUG;
+        }}
+
+        EXPORT {{
+	        Export_Id = 100;
+	        Transports = TCP;
+	        Path = /;
+	        Pseudo = {pseudo_path};
+	        Protocols = 4;
+	        Access_Type = RW;
+	        Attr_Expiration_Time = 0;
+	        Squash = None;
+	        FSAL {{
+	              Name = CEPH;
+                      Filesystem = {fs_name};
+                      User_Id = {user_id};
+                      Secret_Access_Key = '{key}';
+	        }}
+        }}"""
+        port, ip = self._get_port_ip_info()
+        self.ctx.cluster.run(args=['ceph', 'nfs', 'cluster', 'config',
+            'set', self.cluster_id, '-i', '-'], stdin=config)
+        time.sleep(30)
+        res = self._sys_cmd(['rados', '-p', pool, '-N', self.cluster_id, 'get',
+                             f'userconf-nfs.{user_id}', '-'])
+        self.assertEqual(config, res.decode('utf-8'))
+        self._test_mnt(pseudo_path, port, ip)
+        self._nfs_cmd('cluster', 'config', 'reset', self.cluster_id)
+        rados_obj_ls = self._sys_cmd(['rados', '-p', NFS_POOL_NAME, '-N', self.cluster_id, 'ls'])
+        if b'conf-nfs' not in rados_obj_ls and b'userconf-nfs' in rados_obj_ls:
+            self.fail("User config not deleted")
+        time.sleep(30)
+        self._test_mnt(pseudo_path, port, ip, False)
+        self._cmd('fs', 'volume', 'rm', fs_name, '--yes-i-really-mean-it')
+        self._test_delete_cluster()
+
+    def test_cluster_set_user_config_with_non_existing_clusterid(self):
+        '''
+        Test setting user config for non-existing nfs cluster.
+        '''
+        try:
+            cluster_id = 'invalidtest'
+            self.ctx.cluster.run(args=['ceph', 'nfs', 'cluster',
+                'config', 'set', self.cluster_id, '-i', '-'], stdin='testing')
+            self.fail(f"User config set for non-existing cluster {cluster_id}")
+        except CommandFailedError as e:
+            # Command should fail for test to pass
+            if e.exitstatus != errno.ENOENT:
+                raise
+
+    def test_cluster_reset_user_config_with_non_existing_clusterid(self):
+        '''
+        Test resetting user config for non-existing nfs cluster.
+        '''
+        try:
+            cluster_id = 'invalidtest'
+            self._nfs_cmd('cluster', 'config', 'reset', cluster_id)
+            self.fail(f"User config reset for non-existing cluster {cluster_id}")
+        except CommandFailedError as e:
+            # Command should fail for test to pass
+            if e.exitstatus != errno.ENOENT:
+                raise
+
+    def test_create_export_via_apply(self):
+        '''
+        Test creation of export via apply
+        '''
+        self._test_create_cluster()
+        self.ctx.cluster.run(args=['ceph', 'nfs', 'export', 'apply',
+                                   self.cluster_id, '-i', '-'],
+                             stdin=json.dumps({
+                                 "path": "/",
+                                 "pseudo": "/cephfs",
+                                 "squash": "none",
+                                 "access_type": "rw",
+                                 "protocols": [4],
+                                 "fsal": {
+                                     "name": "CEPH",
+                                     "fs_name": self.fs_name
+                                 }
+                             }))
+        port, ip = self._get_port_ip_info()
+        self._test_mnt(self.pseudo_path, port, ip)
+        self._check_nfs_cluster_status(
+            'running', 'NFS Ganesha cluster not running after new export was applied')
+        self._test_delete_cluster()
+
+    def test_update_export(self):
+        '''
+        Test update of export's pseudo path and access type from rw to ro
+        '''
+        self._create_default_export()
+        port, ip = self._get_port_ip_info()
+        self._test_mnt(self.pseudo_path, port, ip)
+        export_block = self._get_export()
+        new_pseudo_path = '/testing'
+        export_block['pseudo'] = new_pseudo_path
+        export_block['access_type'] = 'RO'
+        self.ctx.cluster.run(args=['ceph', 'nfs', 'export', 'apply',
+                                   self.cluster_id, '-i', '-'],
+                             stdin=json.dumps(export_block))
+        if not self._check_nfs_cluster_event('restart'):
+            self.fail("updating export's pseudo path should trigger restart of NFS service")
+        self._check_nfs_cluster_status('running', 'NFS Ganesha cluster not running after restart')
+        self._write_to_read_only_export(new_pseudo_path, port, ip)
+        self._test_delete_cluster()
+
+    def test_update_export_ro_to_rw(self):
+        '''
+        Test update of export's access level from ro to rw
+        '''
+        self._test_create_cluster()
+        self._create_export(
+            export_id='1', create_fs=True,
+            extra_cmd=['--pseudo-path', self.pseudo_path, '--readonly'])
+        port, ip = self._get_port_ip_info()
+        self._write_to_read_only_export(self.pseudo_path, port, ip)
+        export_block = self._get_export()
+        export_block['access_type'] = 'RW'
+        self.ctx.cluster.run(
+            args=['ceph', 'nfs', 'export', 'apply', self.cluster_id, '-i', '-'],
+            stdin=json.dumps(export_block))
+        if self._check_nfs_cluster_event('restart'):
+            self.fail("update of export's access type should not trigger NFS service restart")
+        self._test_mnt(self.pseudo_path, port, ip)
+        self._test_delete_cluster()
+
+    def test_update_export_with_invalid_values(self):
+        '''
+        Test update of export with invalid values
+        '''
+        self._create_default_export()
+        export_block = self._get_export()
+
+        def update_with_invalid_values(key, value, fsal=False):
+            export_block_new = dict(export_block)
+            if fsal:
+                export_block_new['fsal'] = dict(export_block['fsal'])
+                export_block_new['fsal'][key] = value
+            else:
+                export_block_new[key] = value
+            try:
+                self.ctx.cluster.run(args=['ceph', 'nfs', 'export', 'apply',
+                                           self.cluster_id, '-i', '-'],
+                        stdin=json.dumps(export_block_new))
+            except CommandFailedError:
+                pass
+
+        update_with_invalid_values('export_id', 9)
+        update_with_invalid_values('cluster_id', 'testing_new')
+        update_with_invalid_values('pseudo', 'test_relpath')
+        update_with_invalid_values('access_type', 'W')
+        update_with_invalid_values('squash', 'no_squash')
+        update_with_invalid_values('security_label', 'invalid')
+        update_with_invalid_values('protocols', [2])
+        update_with_invalid_values('transports', ['UD'])
+        update_with_invalid_values('name', 'RGW', True)
+        update_with_invalid_values('user_id', 'testing_export', True)
+        update_with_invalid_values('fs_name', 'b', True)
+        self._test_delete_cluster()
+
+    def test_cmds_without_reqd_args(self):
+        '''
+        Test that cmd fails on not passing required arguments
+        '''
+        def exec_cmd_invalid(*cmd):
+            try:
+                self._nfs_cmd(*cmd)
+                self.fail(f"nfs {cmd} command executed successfully without required arguments")
+            except CommandFailedError as e:
+                # Command should fail for test to pass
+                if e.exitstatus != errno.EINVAL:
+                    raise
+
+        exec_cmd_invalid('cluster', 'create')
+        exec_cmd_invalid('cluster', 'delete')
+        exec_cmd_invalid('cluster', 'config', 'set')
+        exec_cmd_invalid('cluster', 'config', 'reset')
+        exec_cmd_invalid('export', 'create', 'cephfs')
+        exec_cmd_invalid('export', 'create', 'cephfs', 'clusterid')
+        exec_cmd_invalid('export', 'create', 'cephfs', 'clusterid', 'a_fs')
+        exec_cmd_invalid('export', 'ls')
+        exec_cmd_invalid('export', 'delete')
+        exec_cmd_invalid('export', 'delete', 'clusterid')
+        exec_cmd_invalid('export', 'info')
+        exec_cmd_invalid('export', 'info', 'clusterid')
+        exec_cmd_invalid('export', 'apply')
diff --git a/qa/tasks/cephfs/test_openfiletable.py b/qa/tasks/cephfs/test_openfiletable.py
new file mode 100644
index 000000000..eff6b5093
--- /dev/null
+++ b/qa/tasks/cephfs/test_openfiletable.py
@@ -0,0 +1,85 @@
+import time
+import logging
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class OpenFileTable(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def _check_oft_counter(self, name, count):
+        perf_dump = self.fs.mds_asok(['perf', 'dump'])
+        if perf_dump['oft'][name] == count:
+            return True
+        return False
+
+    def test_max_items_per_obj(self):
+        """
+        The maximum number of openfiles omap objects keys are now equal to
+        osd_deep_scrub_large_omap_object_key_threshold option.
+        """
+        self.set_conf("mds", "osd_deep_scrub_large_omap_object_key_threshold", "5")
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        # Write some bytes to a file
+        size_mb = 1
+
+        # Hold the file open
+        file_count = 8
+        for i in range(0, file_count):
+            filename = "open_file{}".format(i)
+            p = self.mount_a.open_background(filename)
+            self.mount_a.write_n_mb(filename, size_mb)
+
+        time.sleep(10)
+
+        """
+        With osd_deep_scrub_large_omap_object_key_threshold value as 5 and
+        opening 8 files we should have a new rados object with name
+        mds0_openfiles.1 to hold the extra keys.
+        """
+
+        self.fs.radosm(["stat", "mds0_openfiles.1"])
+
+        # Now close the file
+        self.mount_a.kill_background(p)
+
+    def test_perf_counters(self):
+        """
+        Opening a file should increment omap_total_updates by 1.
+        """
+
+        self.set_conf("mds", "osd_deep_scrub_large_omap_object_key_threshold", "1")
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        perf_dump = self.fs.mds_asok(['perf', 'dump'])
+        omap_total_updates_0 = perf_dump['oft']['omap_total_updates']
+        log.info("omap_total_updates_0:{}".format(omap_total_updates_0))
+        
+        # Open the file
+        p = self.mount_a.open_background("omap_counter_test_file")
+        self.wait_until_true(lambda: self._check_oft_counter('omap_total_updates', 2), timeout=120)
+        
+        perf_dump = self.fs.mds_asok(['perf', 'dump'])
+        omap_total_updates_1 = perf_dump['oft']['omap_total_updates']
+        log.info("omap_total_updates_1:{}".format(omap_total_updates_1))
+        
+        self.assertTrue((omap_total_updates_1 - omap_total_updates_0) == 2)
+        
+        # Now close the file
+        self.mount_a.kill_background(p)
+        # Ensure that the file does not exist any more
+        self.wait_until_true(lambda: self._check_oft_counter('omap_total_removes', 1), timeout=120)
+        self.wait_until_true(lambda: self._check_oft_counter('omap_total_kv_pairs', 1), timeout=120)
+
+        perf_dump = self.fs.mds_asok(['perf', 'dump'])
+        omap_total_removes = perf_dump['oft']['omap_total_removes']
+        omap_total_kv_pairs = perf_dump['oft']['omap_total_kv_pairs']
+        log.info("omap_total_removes:{}".format(omap_total_removes))
+        log.info("omap_total_kv_pairs:{}".format(omap_total_kv_pairs))
+        self.assertTrue(omap_total_removes == 1)
+        self.assertTrue(omap_total_kv_pairs == 1)
diff --git a/qa/tasks/cephfs/test_pool_perm.py b/qa/tasks/cephfs/test_pool_perm.py
new file mode 100644
index 000000000..9912debed
--- /dev/null
+++ b/qa/tasks/cephfs/test_pool_perm.py
@@ -0,0 +1,109 @@
+from textwrap import dedent
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import os
+
+
+class TestPoolPerm(CephFSTestCase):
+    def test_pool_perm(self):
+        self.mount_a.run_shell(["touch", "test_file"])
+
+        file_path = os.path.join(self.mount_a.mountpoint, "test_file")
+
+        remote_script = dedent("""
+            import os
+            import errno
+
+            fd = os.open("{path}", os.O_RDWR)
+            try:
+                if {check_read}:
+                    ret = os.read(fd, 1024)
+                else:
+                    os.write(fd, b'content')
+            except OSError as e:
+                if e.errno != errno.EPERM:
+                    raise
+            else:
+                raise RuntimeError("client does not check permission of data pool")
+            """)
+
+        client_name = "client.{0}".format(self.mount_a.client_id)
+
+        # set data pool read only
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow r pool={0}'.format(self.fs.get_data_pool_name()))
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount_wait()
+
+        # write should fail
+        self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(False)))
+
+        # set data pool write only
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow w pool={0}'.format(self.fs.get_data_pool_name()))
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount_wait()
+
+        # read should fail
+        self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(True)))
+
+    def test_forbidden_modification(self):
+        """
+        That a client who does not have the capability for setting
+        layout pools is prevented from doing so.
+        """
+
+        # Set up
+        client_name = "client.{0}".format(self.mount_a.client_id)
+        new_pool_name = "data_new"
+        self.fs.add_data_pool(new_pool_name)
+
+        self.mount_a.run_shell(["touch", "layoutfile"])
+        self.mount_a.run_shell(["mkdir", "layoutdir"])
+
+        # Set MDS 'rw' perms: missing 'p' means no setting pool layouts
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow rw', 'mon', 'allow r',
+            'osd',
+            'allow rw pool={0},allow rw pool={1}'.format(
+                self.fs.get_data_pool_names()[0],
+                self.fs.get_data_pool_names()[1],
+            ))
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount_wait()
+
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool",
+                                  new_pool_name)
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool",
+                                  new_pool_name)
+        self.mount_a.umount_wait()
+
+        # Set MDS 'rwp' perms: should now be able to set layouts
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow rwp', 'mon', 'allow r',
+            'osd',
+            'allow rw pool={0},allow rw pool={1}'.format(
+                self.fs.get_data_pool_names()[0],
+                self.fs.get_data_pool_names()[1],
+            ))
+        self.mount_a.mount_wait()
+        self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool",
+                              new_pool_name)
+        self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool",
+                              new_pool_name)
+        self.mount_a.umount_wait()
+
+    def tearDown(self):
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_a.client_id),
+            'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow rw pool={0}'.format(self.fs.get_data_pool_names()[0]))
+        super(TestPoolPerm, self).tearDown()
+
diff --git a/qa/tasks/cephfs/test_quota.py b/qa/tasks/cephfs/test_quota.py
new file mode 100644
index 000000000..0386672bd
--- /dev/null
+++ b/qa/tasks/cephfs/test_quota.py
@@ -0,0 +1,106 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+from teuthology.exceptions import CommandFailedError
+
+class TestQuota(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 1
+
+    def test_remote_update_getfattr(self):
+        """
+        That quota changes made from one client are visible to another
+        client looking at ceph.quota xattrs
+        """
+        self.mount_a.run_shell(["mkdir", "subdir"])
+
+        self.assertEqual(
+            self.mount_a.getfattr("./subdir", "ceph.quota.max_files"),
+            None)
+        self.assertEqual(
+            self.mount_b.getfattr("./subdir", "ceph.quota.max_files"),
+            None)
+
+        self.mount_a.setfattr("./subdir", "ceph.quota.max_files", "10")
+        self.assertEqual(
+            self.mount_a.getfattr("./subdir", "ceph.quota.max_files"),
+            "10")
+
+        # Should be visible as soon as setxattr operation completes on
+        # mds (we get here sooner because setfattr gets an early reply)
+        self.wait_until_equal(
+            lambda: self.mount_b.getfattr("./subdir", "ceph.quota.max_files"),
+            "10", timeout=10)
+
+    def test_remote_update_df(self):
+        """
+        That when a client modifies the quota on a directory used
+        as another client's root, the other client sees the change
+        reflected in their statfs output.
+        """
+
+        self.mount_b.umount_wait()
+
+        self.mount_a.run_shell(["mkdir", "subdir"])
+
+        size_before = 1024 * 1024 * 128
+        self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes",
+                              "%s" % size_before)
+
+        self.mount_b.mount_wait(cephfs_mntpt="/subdir")
+
+        self.assertDictEqual(
+            self.mount_b.df(),
+            {
+                "total": size_before,
+                "used": 0,
+                "available": size_before
+            })
+
+        size_after = 1024 * 1024 * 256
+        self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes",
+                              "%s" % size_after)
+
+        # Should be visible as soon as setxattr operation completes on
+        # mds (we get here sooner because setfattr gets an early reply)
+        self.wait_until_equal(
+            lambda: self.mount_b.df(),
+            {
+                "total": size_after,
+                "used": 0,
+                "available": size_after
+            },
+            timeout=10
+        )
+
+    def test_remote_update_write(self):
+        """
+        That when a client modifies the quota on a directory used
+        as another client's root, the other client sees the effect
+        of the change when writing data.
+        """
+
+        self.mount_a.run_shell(["mkdir", "subdir_files"])
+        self.mount_a.run_shell(["mkdir", "subdir_data"])
+
+        # Set some nice high quotas that mount_b's initial operations
+        # will be well within
+        self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "100")
+        self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "104857600")
+
+        # Do some writes within my quota
+        self.mount_b.create_n_files("subdir_files/file", 20)
+        self.mount_b.write_n_mb("subdir_data/file", 20)
+
+        # Set quotas lower than what mount_b already wrote, it should
+        # refuse to write more once it's seen them
+        self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "10")
+        self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "1048576")
+
+        # Do some writes that would have been okay within the old quota,
+        # but are forbidden under the new quota
+        with self.assertRaises(CommandFailedError):
+            self.mount_b.create_n_files("subdir_files/file", 40)
+        with self.assertRaises(CommandFailedError):
+            self.mount_b.write_n_mb("subdir_data/file", 40)
+
diff --git a/qa/tasks/cephfs/test_readahead.py b/qa/tasks/cephfs/test_readahead.py
new file mode 100644
index 000000000..7e6270f03
--- /dev/null
+++ b/qa/tasks/cephfs/test_readahead.py
@@ -0,0 +1,26 @@
+import logging
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestReadahead(CephFSTestCase):
+    def test_flush(self):
+        # Create 32MB file
+        self.mount_a.run_shell(["dd", "if=/dev/urandom", "of=foo", "bs=1M", "count=32"])
+
+        # Unmount and remount the client to flush cache
+        self.mount_a.umount_wait()
+        self.mount_a.mount_wait()
+
+        initial_op_read = self.mount_a.get_op_read_count()
+        self.mount_a.run_shell(["dd", "if=foo", "of=/dev/null", "bs=128k", "count=32"])
+        op_read = self.mount_a.get_op_read_count()
+        self.assertGreaterEqual(op_read, initial_op_read)
+        op_read -= initial_op_read
+        log.info("read operations: {0}".format(op_read))
+
+        # with exponentially increasing readahead, we should see fewer than 10 operations
+        # but this test simply checks if the client is doing a remote read for each local read
+        if op_read >= 32:
+            raise RuntimeError("readahead not working")
diff --git a/qa/tasks/cephfs/test_recovery_fs.py b/qa/tasks/cephfs/test_recovery_fs.py
new file mode 100644
index 000000000..bbcdf9769
--- /dev/null
+++ b/qa/tasks/cephfs/test_recovery_fs.py
@@ -0,0 +1,38 @@
+import logging
+from os.path import join as os_path_join
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class TestFSRecovery(CephFSTestCase):
+    """
+    Tests for recovering FS after loss of FSMap
+    """
+
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 3
+
+    def test_recover_fs_after_fsmap_removal(self):
+        data_pool = self.fs.get_data_pool_name()
+        metadata_pool = self.fs.get_metadata_pool_name()
+        # write data in mount, and fsync
+        self.mount_a.create_n_files('file_on_fs', 1, sync=True)
+        # faild MDSs to allow removing the file system in the next step
+        self.fs.fail()
+        # Remove file system to lose FSMap and keep the pools intact.
+        # This mimics the scenario where the monitor store is rebuilt
+        # using  OSDs to recover a cluster with corrupt monitor store.
+        # The FSMap is permanently lost, but the FS pools are
+        # recovered/intact
+        self.fs.rm()
+        # Recreate file system with pool and previous fscid
+        self.fs.mon_manager.raw_cluster_cmd(
+            'fs', 'new', self.fs.name, metadata_pool, data_pool,
+            '--recover', '--force', '--fscid', f'{self.fs.id}')
+        self.fs.set_joinable()
+        # Check status of file system
+        self.fs.wait_for_daemons()
+        # check data in file sytem is intact
+        filepath = os_path_join(self.mount_a.hostfs_mntpt, 'file_on_fs_0')
+        self.assertEqual(self.mount_a.read_file(filepath), "0")
diff --git a/qa/tasks/cephfs/test_recovery_pool.py b/qa/tasks/cephfs/test_recovery_pool.py
new file mode 100644
index 000000000..9926b3670
--- /dev/null
+++ b/qa/tasks/cephfs/test_recovery_pool.py
@@ -0,0 +1,203 @@
+"""
+Test our tools for recovering metadata from the data pool into an alternate pool
+"""
+
+import logging
+import traceback
+from collections import namedtuple
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class OverlayWorkload(object):
+    def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
+        self._orig_fs = orig_fs
+        self._recovery_fs = recovery_fs
+        self._orig_mount = orig_mount
+        self._recovery_mount = recovery_mount
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+
+        pool = self._orig_fs.get_metadata_pool_name()
+        self._orig_fs.rados(["purge", pool, '--yes-i-really-really-mean-it'])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._orig_fs.mds_asok(["flush", "journal"])
+        self._recovery_fs.mds_asok(["flush", "journal"])
+
+
+class SimpleOverlayWorkload(OverlayWorkload):
+    """
+    Single file, single directory, check that it gets recovered and so does its size
+    """
+    def write(self):
+        self._orig_mount.run_shell(["mkdir", "subdir"])
+        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._orig_mount.stat("subdir/sixmegs")
+
+    def validate(self):
+        self._recovery_mount.run_shell(["ls", "subdir"])
+        st = self._recovery_mount.stat("subdir/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+class TestRecoveryPool(CephFSTestCase):
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
+    REQUIRE_RECOVERY_FILESYSTEM = True
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+        workload.flush()
+
+        # Create the alternate pool if requested
+        recovery_fs = self.recovery_fs.name
+        recovery_pool = self.recovery_fs.get_metadata_pool_name()
+        self.recovery_fs.data_scan(['init', '--force-init',
+                                    '--filesystem', recovery_fs,
+                                    '--alternate-pool', recovery_pool])
+        self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
+
+        # Stop the MDS
+        self.fs.mds_stop() # otherwise MDS will join once the fs is reset
+        self.fs.fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        self.fs.reset()
+
+        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure
+        if False:
+            with self.assertRaises(CommandFailedError):
+                # Normal reset should fail when no objects are present, we'll use --force instead
+                self.fs.journal_tool(["journal", "reset"], 0)
+
+        self.fs.data_scan(['scan_extents', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           self.fs.get_data_pool_name()])
+        self.fs.data_scan(['scan_inodes', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list',
+                              '--alternate-pool', recovery_pool], 0)
+
+        self.fs.data_scan(['init', '--force-init', '--filesystem',
+                           self.fs.name])
+        self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
+
+        self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
+        self.fs.journal_tool(['journal', 'reset', '--force'], 0)
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
+                                            recovery_fs + ":0")
+
+        # Mark the MDS repaired
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+        # Start the MDS
+        self.fs.mds_restart()
+        self.fs.set_joinable()
+        self.recovery_fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.recovery_fs.wait_for_daemons()
+        status = self.recovery_fs.status()
+        for rank in self.recovery_fs.get_ranks(status=status):
+            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
+                                                'injectargs', '--debug-mds=20')
+            self.fs.rank_tell(['scrub', 'start', '/', 'recursive,repair'], rank=rank['rank'], status=status)
+        log.info(str(self.mds_cluster.status()))
+
+        # Mount a client
+        self.mount_a.mount_wait()
+        self.mount_b.mount_wait(cephfs_name=recovery_fs)
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
+                                                     self.mount_a, self.mount_b))
diff --git a/qa/tasks/cephfs/test_scrub.py b/qa/tasks/cephfs/test_scrub.py
new file mode 100644
index 000000000..dd7c11af5
--- /dev/null
+++ b/qa/tasks/cephfs/test_scrub.py
@@ -0,0 +1,178 @@
+"""
+Test CephFS scrub (distinct from OSD scrub) functionality
+"""
+
+from io import BytesIO
+import logging
+from collections import namedtuple
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class Workload(CephFSTestCase):
+    def __init__(self, test, filesystem, mount):
+        super().__init__()
+        self._test =  test
+        self._mount = mount
+        self._filesystem = filesystem
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        pool = self._filesystem.get_metadata_pool_name()
+        self._filesystem.rados(["purge", pool, '--yes-i-really-really-mean-it'])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._filesystem.mds_asok(["flush", "journal"])
+
+
+class BacktraceWorkload(Workload):
+    """
+    Single file, single directory, wipe the backtrace and check it.
+    """
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+
+    def validate(self):
+        st = self._mount.stat("subdir/sixmegs")
+        self._filesystem.mds_asok(["flush", "journal"])
+        bt = self._filesystem.read_backtrace(st['st_ino'])
+        parent = bt['ancestors'][0]['dname']
+        self.assertEqual(parent, 'sixmegs')
+        return self._errors
+
+    def damage(self):
+        st = self._mount.stat("subdir/sixmegs")
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._filesystem._write_data_xattr(st['st_ino'], "parent", "")
+
+    def create_files(self, nfiles=1000):
+        self._mount.create_n_files("scrub-new-files/file", nfiles)
+
+
+class DupInodeWorkload(Workload):
+    """
+    Duplicate an inode and try scrubbing it twice."
+    """
+
+    def write(self):
+        self._mount.run_shell(["mkdir", "parent"])
+        self._mount.run_shell(["mkdir", "parent/child"])
+        self._mount.write_n_mb("parent/parentfile", 6)
+        self._mount.write_n_mb("parent/child/childfile", 6)
+
+    def damage(self):
+        self._mount.umount_wait()
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._filesystem.fail()
+        d = self._filesystem.radosmo(["getomapval", "10000000000.00000000", "parentfile_head", "-"])
+        self._filesystem.radosm(["setomapval", "10000000000.00000000", "shadow_head"], stdin=BytesIO(d))
+        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
+        self._filesystem.set_joinable()
+        self._filesystem.wait_for_daemons()
+
+    def validate(self):
+        out_json = self._filesystem.run_scrub(["start", "/", "recursive,repair"])
+        self.assertNotEqual(out_json, None)
+        self.assertEqual(out_json["return_code"], 0)
+        self.assertEqual(self._filesystem.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
+        self.assertTrue(self._filesystem.are_daemons_healthy())
+        return self._errors
+
+
+class TestScrub(CephFSTestCase):
+    MDSS_REQUIRED = 1
+
+    def setUp(self):
+        super().setUp()
+
+    def _scrub(self, workload, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        out_json = self.fs.run_scrub(["start", "/", "recursive,repair"])
+        self.assertNotEqual(out_json, None)
+        self.assertEqual(out_json["return_code"], 0)
+        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def _get_damage_count(self, damage_type='backtrace'):
+        out_json = self.fs.rank_tell(["damage", "ls"])
+        self.assertNotEqual(out_json, None)
+
+        damage_count = 0
+        for it in out_json:
+            if it['damage_type'] == damage_type:
+                damage_count += 1
+        return damage_count
+
+    def _scrub_new_files(self, workload):
+        """
+        That scrubbing new files does not lead to errors
+        """
+        workload.create_files(1000)
+        self.fs.wait_until_scrub_complete()
+        self.assertEqual(self._get_damage_count(), 0)
+
+    def test_scrub_backtrace_for_new_files(self):
+        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))
+
+    def test_scrub_backtrace(self):
+        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))
+
+    def test_scrub_dup_inode(self):
+        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))
diff --git a/qa/tasks/cephfs/test_scrub_checks.py b/qa/tasks/cephfs/test_scrub_checks.py
new file mode 100644
index 000000000..bcfc2fc9a
--- /dev/null
+++ b/qa/tasks/cephfs/test_scrub_checks.py
@@ -0,0 +1,421 @@
+"""
+MDS admin socket scrubbing-related tests.
+"""
+import json
+import logging
+import errno
+import time
+from teuthology.exceptions import CommandFailedError
+from teuthology.contextutil import safe_while
+import os
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class TestScrubControls(CephFSTestCase):
+    """
+    Test basic scrub control operations such as abort, pause and resume.
+    """
+
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 1
+
+    def _abort_scrub(self, expected):
+        res = self.fs.run_scrub(["abort"])
+        self.assertEqual(res['return_code'], expected)
+    def _pause_scrub(self, expected):
+        res = self.fs.run_scrub(["pause"])
+        self.assertEqual(res['return_code'], expected)
+    def _resume_scrub(self, expected):
+        res = self.fs.run_scrub(["resume"])
+        self.assertEqual(res['return_code'], expected)
+    def _check_task_status(self, expected_status, timo=120):
+        """ check scrub status for current active mds in ceph status """
+        with safe_while(sleep=1, tries=120, action='wait for task status') as proceed:
+            while proceed():
+                active = self.fs.get_active_names()
+                log.debug("current active={0}".format(active))
+                task_status = self.fs.get_task_status("scrub status")
+                try:
+                    if task_status[active[0]].startswith(expected_status):
+                        return True
+                except KeyError:
+                    pass
+
+    def _check_task_status_na(self, timo=120):
+        """ check absence of scrub status in ceph status """
+        with safe_while(sleep=1, tries=120, action='wait for task status') as proceed:
+            while proceed():
+                active = self.fs.get_active_names()
+                log.debug("current active={0}".format(active))
+                task_status = self.fs.get_task_status("scrub status")
+                if not active[0] in task_status:
+                    return True
+
+    def create_scrub_data(self, test_dir):
+        for i in range(32):
+            dirname = "dir.{0}".format(i)
+            dirpath = os.path.join(test_dir, dirname)
+            self.mount_a.run_shell_payload(f"""
+set -e
+mkdir -p {dirpath}
+for ((i = 0; i < 32; i++)); do
+    dd if=/dev/urandom of={dirpath}/filename.$i bs=1M conv=fdatasync count=1
+done
+""")
+
+    def test_scrub_abort(self):
+        test_dir = "scrub_control_test_path"
+        abs_test_path = "/{0}".format(test_dir)
+
+        self.create_scrub_data(test_dir)
+
+        out_json = self.fs.run_scrub(["start", abs_test_path, "recursive"])
+        self.assertNotEqual(out_json, None)
+
+        # abort and verify
+        self._abort_scrub(0)
+        self.fs.wait_until_scrub_complete(sleep=5, timeout=30)
+
+        # sleep enough to fetch updated task status
+        checked = self._check_task_status_na()
+        self.assertTrue(checked)
+
+    def test_scrub_pause_and_resume(self):
+        test_dir = "scrub_control_test_path"
+        abs_test_path = "/{0}".format(test_dir)
+
+        log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
+        client_path = os.path.join(self.mount_a.mountpoint, test_dir)
+        log.info("client_path: {0}".format(client_path))
+
+        self.create_scrub_data(test_dir)
+
+        out_json = self.fs.run_scrub(["start", abs_test_path, "recursive"])
+        self.assertNotEqual(out_json, None)
+
+        # pause and verify
+        self._pause_scrub(0)
+        out_json = self.fs.get_scrub_status()
+        self.assertTrue("PAUSED" in out_json['status'])
+
+        checked = self._check_task_status("paused")
+        self.assertTrue(checked)
+
+        # resume and verify
+        self._resume_scrub(0)
+        out_json = self.fs.get_scrub_status()
+        self.assertFalse("PAUSED" in out_json['status'])
+
+        checked = self._check_task_status_na()
+        self.assertTrue(checked)
+
+    def test_scrub_pause_and_resume_with_abort(self):
+        test_dir = "scrub_control_test_path"
+        abs_test_path = "/{0}".format(test_dir)
+
+        self.create_scrub_data(test_dir)
+
+        out_json = self.fs.run_scrub(["start", abs_test_path, "recursive"])
+        self.assertNotEqual(out_json, None)
+
+        # pause and verify
+        self._pause_scrub(0)
+        out_json = self.fs.get_scrub_status()
+        self.assertTrue("PAUSED" in out_json['status'])
+
+        checked = self._check_task_status("paused")
+        self.assertTrue(checked)
+
+        # abort and verify
+        self._abort_scrub(0)
+        out_json = self.fs.get_scrub_status()
+        self.assertTrue("PAUSED" in out_json['status'])
+        self.assertTrue("0 inodes" in out_json['status'])
+
+        # scrub status should still be paused...
+        checked = self._check_task_status("paused")
+        self.assertTrue(checked)
+
+        # resume and verify
+        self._resume_scrub(0)
+        out_json = self.fs.get_scrub_status()
+        self.assertTrue("no active" in out_json['status'])
+
+        checked = self._check_task_status_na()
+        self.assertTrue(checked)
+
+    def test_scrub_task_status_on_mds_failover(self):
+        (original_active, ) = self.fs.get_active_names()
+        original_standbys = self.mds_cluster.get_standby_daemons()
+
+        test_dir = "scrub_control_test_path"
+        abs_test_path = "/{0}".format(test_dir)
+
+        self.create_scrub_data(test_dir)
+
+        out_json = self.fs.run_scrub(["start", abs_test_path, "recursive"])
+        self.assertNotEqual(out_json, None)
+
+        # pause and verify
+        self._pause_scrub(0)
+        out_json = self.fs.get_scrub_status()
+        self.assertTrue("PAUSED" in out_json['status'])
+
+        checked = self._check_task_status("paused")
+        self.assertTrue(checked)
+
+        # Kill the rank 0
+        self.fs.mds_stop(original_active)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        def promoted():
+            active = self.fs.get_active_names()
+            return active and active[0] in original_standbys
+
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
+        self.wait_until_true(promoted, timeout=grace*2)
+
+        self._check_task_status_na()
+
+class TestScrubChecks(CephFSTestCase):
+    """
+    Run flush and scrub commands on the specified files in the filesystem. This
+    task will run through a sequence of operations, but it is not comprehensive
+    on its own -- it doesn't manipulate the mds cache state to test on both
+    in- and out-of-memory parts of the hierarchy. So it's designed to be run
+    multiple times within a single test run, so that the test can manipulate
+    memory state.
+
+    Usage:
+    mds_scrub_checks:
+      mds_rank: 0
+      path: path/to/test/dir
+      client: 0
+      run_seq: [0-9]+
+
+    Increment the run_seq on subsequent invocations within a single test run;
+    it uses that value to generate unique folder and file names.
+    """
+
+    MDSS_REQUIRED = 1
+    CLIENTS_REQUIRED = 1
+
+    def test_scrub_checks(self):
+        self._checks(0)
+        self._checks(1)
+
+    def _checks(self, run_seq):
+        mds_rank = 0
+        test_dir = "scrub_test_path"
+
+        abs_test_path = "/{0}".format(test_dir)
+
+        log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
+        client_path = os.path.join(self.mount_a.mountpoint, test_dir)
+        log.info("client_path: {0}".format(client_path))
+
+        log.info("Cloning repo into place")
+        repo_path = TestScrubChecks.clone_repo(self.mount_a, client_path)
+
+        log.info("Initiating mds_scrub_checks on mds.{id_} test_path {path}, run_seq {seq}".format(
+            id_=mds_rank, path=abs_test_path, seq=run_seq)
+        )
+
+
+        success_validator = lambda j, r: self.json_validator(j, r, "return_code", 0)
+
+        nep = "{test_path}/i/dont/exist".format(test_path=abs_test_path)
+        self.asok_command(mds_rank, "flush_path {nep}".format(nep=nep),
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+        self.tell_command(mds_rank, "scrub start {nep}".format(nep=nep),
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+
+        test_repo_path = "{test_path}/ceph-qa-suite".format(test_path=abs_test_path)
+        dirpath = "{repo_path}/suites".format(repo_path=test_repo_path)
+
+        if run_seq == 0:
+            log.info("First run: flushing {dirpath}".format(dirpath=dirpath))
+            command = "flush_path {dirpath}".format(dirpath=dirpath)
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub start {dirpath}".format(dirpath=dirpath)
+        self.tell_command(mds_rank, command, success_validator)
+
+        filepath = "{repo_path}/suites/fs/verify/validater/valgrind.yaml".format(
+            repo_path=test_repo_path)
+        if run_seq == 0:
+            log.info("First run: flushing {filepath}".format(filepath=filepath))
+            command = "flush_path {filepath}".format(filepath=filepath)
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub start {filepath}".format(filepath=filepath)
+        self.tell_command(mds_rank, command, success_validator)
+
+        if run_seq == 0:
+            log.info("First run: flushing base dir /")
+            command = "flush_path /"
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub start /"
+        self.tell_command(mds_rank, command, success_validator)
+
+        new_dir = "{repo_path}/new_dir_{i}".format(repo_path=repo_path, i=run_seq)
+        test_new_dir = "{repo_path}/new_dir_{i}".format(repo_path=test_repo_path,
+                                                        i=run_seq)
+        self.mount_a.run_shell(["mkdir", new_dir])
+        command = "flush_path {dir}".format(dir=test_new_dir)
+        self.asok_command(mds_rank, command, success_validator)
+
+        new_file = "{repo_path}/new_file_{i}".format(repo_path=repo_path,
+                                                     i=run_seq)
+        test_new_file = "{repo_path}/new_file_{i}".format(repo_path=test_repo_path,
+                                                          i=run_seq)
+        self.mount_a.write_n_mb(new_file, 1)
+
+        command = "flush_path {file}".format(file=test_new_file)
+        self.asok_command(mds_rank, command, success_validator)
+
+        # check that scrub fails on errors
+        ino = self.mount_a.path_to_ino(new_file)
+        rados_obj_name = "{ino:x}.00000000".format(ino=ino)
+        command = "scrub start {file}".format(file=test_new_file)
+
+        def _check_and_clear_damage(ino, dtype):
+            all_damage = self.fs.rank_tell(["damage", "ls"], mds_rank)
+            damage = [d for d in all_damage if d['ino'] == ino and d['damage_type'] == dtype]
+            for d in damage:
+                self.fs.mon_manager.raw_cluster_cmd(
+                    'tell', 'mds.{0}'.format(self.fs.get_active_names()[mds_rank]),
+                    "damage", "rm", str(d['id']))
+            return len(damage) > 0
+
+        # Missing parent xattr
+        self.assertFalse(_check_and_clear_damage(ino, "backtrace"));
+        self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
+        self.tell_command(mds_rank, command, success_validator)
+        self.fs.wait_until_scrub_complete(sleep=5, timeout=30)
+        self.assertTrue(_check_and_clear_damage(ino, "backtrace"));
+
+        command = "flush_path /"
+        self.asok_command(mds_rank, command, success_validator)
+
+    def test_scrub_repair(self):
+        mds_rank = 0
+        test_dir = "scrub_repair_path"
+
+        self.mount_a.run_shell(["mkdir", test_dir])
+        self.mount_a.run_shell(["touch", "{0}/file".format(test_dir)])
+        dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino(test_dir))
+
+        self.mount_a.umount_wait()
+
+        # flush journal entries to dirfrag objects, and expire journal
+        self.fs.mds_asok(['flush', 'journal'])
+        self.fs.mds_stop()
+
+        # remove the dentry from dirfrag, cause incorrect fragstat/rstat
+        self.fs.radosm(["rmomapkey", dir_objname, "file_head"])
+
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount_wait()
+
+        # fragstat indicates the directory is not empty, rmdir should fail
+        with self.assertRaises(CommandFailedError) as ar:
+            self.mount_a.run_shell(["rmdir", test_dir])
+        self.assertEqual(ar.exception.exitstatus, 1)
+
+        self.tell_command(mds_rank, "scrub start /{0} repair".format(test_dir),
+                          lambda j, r: self.json_validator(j, r, "return_code", 0))
+
+        # wait a few second for background repair
+        time.sleep(10)
+
+        # fragstat should be fixed
+        self.mount_a.run_shell(["rmdir", test_dir])
+
+    @staticmethod
+    def json_validator(json_out, rc, element, expected_value):
+        if rc != 0:
+            return False, "asok command returned error {rc}".format(rc=rc)
+        element_value = json_out.get(element)
+        if element_value != expected_value:
+            return False, "unexpectedly got {jv} instead of {ev}!".format(
+                jv=element_value, ev=expected_value)
+        return True, "Succeeded"
+
+    def tell_command(self, mds_rank, command, validator):
+        log.info("Running command '{command}'".format(command=command))
+
+        command_list = command.split()
+        jout = self.fs.rank_tell(command_list, mds_rank)
+
+        log.info("command '{command}' returned '{jout}'".format(
+                     command=command, jout=jout))
+
+        success, errstring = validator(jout, 0)
+        if not success:
+            raise AsokCommandFailedError(command, 0, jout, errstring)
+        return jout
+
+    def asok_command(self, mds_rank, command, validator):
+        log.info("Running command '{command}'".format(command=command))
+
+        command_list = command.split()
+
+        # we just assume there's an active mds for every rank
+        mds_id = self.fs.get_active_names()[mds_rank]
+        proc = self.fs.mon_manager.admin_socket('mds', mds_id,
+                                                command_list, check_status=False)
+        rout = proc.exitstatus
+        sout = proc.stdout.getvalue()
+
+        if sout.strip():
+            jout = json.loads(sout)
+        else:
+            jout = None
+
+        log.info("command '{command}' got response code '{rout}' and stdout '{sout}'".format(
+            command=command, rout=rout, sout=sout))
+
+        success, errstring = validator(jout, rout)
+
+        if not success:
+            raise AsokCommandFailedError(command, rout, jout, errstring)
+
+        return jout
+
+    @staticmethod
+    def clone_repo(client_mount, path):
+        repo = "ceph-qa-suite"
+        repo_path = os.path.join(path, repo)
+        client_mount.run_shell(["mkdir", "-p", path])
+
+        try:
+            client_mount.stat(repo_path)
+        except CommandFailedError:
+            client_mount.run_shell([
+                "git", "clone", '--branch', 'giant',
+                "http://github.com/ceph/{repo}".format(repo=repo),
+                "{path}/{repo}".format(path=path, repo=repo)
+            ])
+
+        return repo_path
+
+
+class AsokCommandFailedError(Exception):
+    """
+    Exception thrown when we get an unexpected response
+    on an admin socket command
+    """
+
+    def __init__(self, command, rc, json_out, errstring):
+        self.command = command
+        self.rc = rc
+        self.json = json_out
+        self.errstring = errstring
+
+    def __str__(self):
+        return "Admin socket: {command} failed with rc={rc} json output={json}, because '{es}'".format(
+            command=self.command, rc=self.rc, json=self.json, es=self.errstring)
diff --git a/qa/tasks/cephfs/test_sessionmap.py b/qa/tasks/cephfs/test_sessionmap.py
new file mode 100644
index 000000000..ad6fd1d60
--- /dev/null
+++ b/qa/tasks/cephfs/test_sessionmap.py
@@ -0,0 +1,232 @@
+import time
+import json
+import logging
+
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestSessionMap(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 2
+
+    def test_tell_session_drop(self):
+        """
+        That when a `tell` command is sent using the python CLI,
+        its MDS session is gone after it terminates
+        """
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        status = self.fs.status()
+        self.fs.rank_tell(["session", "ls"], status=status)
+
+        ls_data = self.fs.rank_asok(['session', 'ls'], status=status)
+        self.assertEqual(len(ls_data), 0)
+
+    def _get_connection_count(self, status=None):
+        perf = self.fs.rank_asok(["perf", "dump"], status=status)
+        conn = 0
+        for module, dump in perf.items():
+            if "AsyncMessenger::Worker" in module:
+                conn += dump['msgr_active_connections']
+        return conn
+
+    def test_tell_conn_close(self):
+        """
+        That when a `tell` command is sent using the python CLI,
+        the conn count goes back to where it started (i.e. we aren't
+        leaving connections open)
+        """
+        self.config_set('mds', 'ms_async_reap_threshold', '1')
+
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        status = self.fs.status()
+        s = self._get_connection_count(status=status)
+        self.fs.rank_tell(["session", "ls"], status=status)
+        self.wait_until_true(
+            lambda: self._get_connection_count(status=status) == s,
+            timeout=30
+        )
+
+    def test_mount_conn_close(self):
+        """
+        That when a client unmounts, the thread count on the MDS goes back
+        to what it was before the client mounted
+        """
+        self.config_set('mds', 'ms_async_reap_threshold', '1')
+
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        status = self.fs.status()
+        s = self._get_connection_count(status=status)
+        self.mount_a.mount_wait()
+        self.assertGreater(self._get_connection_count(status=status), s)
+        self.mount_a.umount_wait()
+        self.wait_until_true(
+            lambda: self._get_connection_count(status=status) == s,
+            timeout=30
+        )
+
+    def test_version_splitting(self):
+        """
+        That when many sessions are updated, they are correctly
+        split into multiple versions to obey mds_sessionmap_keys_per_op
+        """
+
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # Configure MDS to write one OMAP key at once
+        self.set_conf('mds', 'mds_sessionmap_keys_per_op', 1)
+        self.fs.mds_fail_restart()
+        status = self.fs.wait_for_daemons()
+
+        # Bring the clients back
+        self.mount_a.mount_wait()
+        self.mount_b.mount_wait()
+
+        # See that they've got sessions
+        self.assert_session_count(2, mds_id=self.fs.get_rank(status=status)['name'])
+
+        # See that we persist their sessions
+        self.fs.rank_asok(["flush", "journal"], rank=0, status=status)
+        table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+        log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+        self.assertEqual(table_json['0']['result'], 0)
+        self.assertEqual(len(table_json['0']['data']['sessions']), 2)
+
+        # Now, induce a "force_open_sessions" event by exporting a dir
+        self.mount_a.run_shell(["mkdir", "bravo"])
+        self.mount_a.run_shell(["touch", "bravo/file_a"])
+        self.mount_b.run_shell(["touch", "bravo/file_b"])
+
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        def get_omap_wrs():
+            return self.fs.rank_asok(['perf', 'dump', 'objecter'], rank=1, status=status)['objecter']['omap_wr']
+
+        # Flush so that there are no dirty sessions on rank 1
+        self.fs.rank_asok(["flush", "journal"], rank=1, status=status)
+
+        # Export so that we get a force_open to rank 1 for the two sessions from rank 0
+        initial_omap_wrs = get_omap_wrs()
+        self.fs.rank_asok(['export', 'dir', '/bravo', '1'], rank=0, status=status)
+
+        # This is the critical (if rather subtle) check: that in the process of doing an export dir,
+        # we hit force_open_sessions, and as a result we end up writing out the sessionmap.  There
+        # will be two sessions dirtied here, and because we have set keys_per_op to 1, we should see
+        # a single session get written out (the first of the two, triggered by the second getting marked
+        # dirty)
+        # The number of writes is two per session, because the header (sessionmap version) update and
+        # KV write both count. Also, multiply by 2 for each openfile table update.
+        self.wait_until_true(
+            lambda: get_omap_wrs() - initial_omap_wrs == 2*2,
+            timeout=30  # Long enough for an export to get acked
+        )
+
+        # Now end our sessions and check the backing sessionmap is updated correctly
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # In-memory sessionmap check
+        self.assert_session_count(0, mds_id=self.fs.get_rank(status=status)['name'])
+
+        # On-disk sessionmap check
+        self.fs.rank_asok(["flush", "journal"], rank=0, status=status)
+        table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+        log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+        self.assertEqual(table_json['0']['result'], 0)
+        self.assertEqual(len(table_json['0']['data']['sessions']), 0)
+
+    def _configure_auth(self, mount, id_name, mds_caps, osd_caps=None, mon_caps=None):
+        """
+        Set up auth credentials for a client mount, and write out the keyring
+        for the client to use.
+        """
+
+        if osd_caps is None:
+            osd_caps = "allow rw"
+
+        if mon_caps is None:
+            mon_caps = "allow r"
+
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.{name}".format(name=id_name),
+            "mds", mds_caps,
+            "osd", osd_caps,
+            "mon", mon_caps
+        )
+        mount.client_id = id_name
+        mount.client_remote.write_file(mount.get_keyring_path(), out, sudo=True)
+        self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
+
+    def test_session_reject(self):
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Requires FUSE client to inject client metadata")
+
+        self.mount_a.run_shell(["mkdir", "foo"])
+        self.mount_a.run_shell(["mkdir", "foo/bar"])
+        self.mount_a.umount_wait()
+
+        # Mount B will be my rejected client
+        self.mount_b.umount_wait()
+
+        # Configure a client that is limited to /foo/bar
+        self._configure_auth(self.mount_b, "badguy", "allow rw path=/foo/bar")
+        # Check he can mount that dir and do IO
+        self.mount_b.mount_wait(cephfs_mntpt="/foo/bar")
+        self.mount_b.create_destroy()
+        self.mount_b.umount_wait()
+
+        # Configure the client to claim that its mount point metadata is /baz
+        self.set_conf("client.badguy", "client_metadata", "root=/baz")
+        # Try to mount the client, see that it fails
+        with self.assert_cluster_log("client session with non-allowable root '/baz' denied"):
+            with self.assertRaises(CommandFailedError):
+                self.mount_b.mount_wait(cephfs_mntpt="/foo/bar")
+
+    def test_session_evict_blocklisted(self):
+        """
+        Check that mds evicts blocklisted client
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Requires FUSE client to use "
+                          "mds_cluster.is_addr_blocklisted()")
+
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.run_shell_payload("mkdir {d0,d1} && touch {d0,d1}/file")
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+        self._wait_subtrees([('/d0', 0), ('/d1', 1)], status=status)
+
+        self.mount_a.run_shell(["touch", "d0/f0"])
+        self.mount_a.run_shell(["touch", "d1/f0"])
+        self.mount_b.run_shell(["touch", "d0/f1"])
+        self.mount_b.run_shell(["touch", "d1/f1"])
+
+        self.assert_session_count(2, mds_id=self.fs.get_rank(rank=0, status=status)['name'])
+        self.assert_session_count(2, mds_id=self.fs.get_rank(rank=1, status=status)['name'])
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id],
+                         mds_id=self.fs.get_rank(rank=0, status=status)['name'])
+        self.wait_until_true(lambda: self.mds_cluster.is_addr_blocklisted(
+            self.mount_a.get_global_addr()), timeout=30)
+
+        # 10 seconds should be enough for evicting client
+        time.sleep(10)
+        self.assert_session_count(1, mds_id=self.fs.get_rank(rank=0, status=status)['name'])
+        self.assert_session_count(1, mds_id=self.fs.get_rank(rank=1, status=status)['name'])
+
+        self.mount_a.kill_cleanup()
+        self.mount_a.mount_wait()
diff --git a/qa/tasks/cephfs/test_snap_schedules.py b/qa/tasks/cephfs/test_snap_schedules.py
new file mode 100644
index 000000000..3ccd4d592
--- /dev/null
+++ b/qa/tasks/cephfs/test_snap_schedules.py
@@ -0,0 +1,448 @@
+import os
+import json
+import time
+import errno
+import logging
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+from datetime import datetime, timedelta
+
+log = logging.getLogger(__name__)
+
+def extract_schedule_and_retention_spec(spec=[]):
+    schedule = set([s[0] for s in spec])
+    retention = set([s[1] for s in spec])
+    return (schedule, retention)
+
+def seconds_upto_next_schedule(time_from, timo):
+    ts = int(time_from)
+    return ((int(ts / 60) * 60) + timo) - ts
+
+class TestSnapSchedules(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+
+    TEST_VOLUME_NAME = 'snap_vol'
+    TEST_DIRECTORY = 'snap_test_dir1'
+
+    # this should be in sync with snap_schedule format
+    SNAPSHOT_TS_FORMAT = '%Y-%m-%d-%H_%M_%S'
+
+    def check_scheduled_snapshot(self, exec_time, timo):
+        now = time.time()
+        delta = now - exec_time
+        log.debug(f'exec={exec_time}, now = {now}, timo = {timo}')
+        # tolerate snapshot existance in the range [-5,+5]
+        self.assertTrue((delta <= timo + 5) and (delta >= timo - 5))
+
+    def _fs_cmd(self, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", *args)
+
+    def fs_snap_schedule_cmd(self, *args, **kwargs):
+        fs = kwargs.pop('fs', self.volname)
+        args += ('--fs', fs)
+        if 'format' in kwargs:
+            fmt = kwargs.pop('format')
+            args += ('--format', fmt)
+        for name, val in kwargs.items():
+            args += (str(val),)
+        res = self._fs_cmd('snap-schedule', *args)
+        log.debug(f'res={res}')
+        return res
+
+    def _create_or_reuse_test_volume(self):
+        result = json.loads(self._fs_cmd("volume", "ls"))
+        if len(result) == 0:
+            self.vol_created = True
+            self.volname = TestSnapSchedules.TEST_VOLUME_NAME
+            self._fs_cmd("volume", "create", self.volname)
+        else:
+            self.volname = result[0]['name']
+
+    def _enable_snap_schedule(self):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", "snap_schedule")
+
+    def _disable_snap_schedule(self):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", "snap_schedule")
+
+    def _allow_minute_granularity_snapshots(self):
+        self.config_set('mgr', 'mgr/snap_schedule/allow_m_granularity', True)
+
+    def setUp(self):
+        super(TestSnapSchedules, self).setUp()
+        self.volname = None
+        self.vol_created = False
+        self._create_or_reuse_test_volume()
+        self.create_cbks = []
+        self.remove_cbks = []
+        # used to figure out which snapshots are created/deleted
+        self.snapshots = set()
+        self._enable_snap_schedule()
+        self._allow_minute_granularity_snapshots()
+
+    def tearDown(self):
+        if self.vol_created:
+            self._delete_test_volume()
+        self._disable_snap_schedule()
+        super(TestSnapSchedules, self).tearDown()
+
+    def _schedule_to_timeout(self, schedule):
+        mult = schedule[-1]
+        period = int(schedule[0:-1])
+        if mult == 'M':
+            return period * 60
+        elif mult == 'h':
+            return period * 60 * 60
+        elif mult == 'd':
+            return period * 60 * 60 * 24
+        elif mult == 'w':
+            return period * 60 * 60 * 24 * 7
+        else:
+            raise RuntimeError('schedule multiplier not recognized')
+
+    def add_snap_create_cbk(self, cbk):
+        self.create_cbks.append(cbk)
+    def remove_snap_create_cbk(self, cbk):
+        self.create_cbks.remove(cbk)
+
+    def add_snap_remove_cbk(self, cbk):
+        self.remove_cbks.append(cbk)
+    def remove_snap_remove_cbk(self, cbk):
+        self.remove_cbks.remove(cbk)
+
+    def assert_if_not_verified(self):
+        self.assertTrue(len(self.create_cbks) == 0 and len(self.remove_cbks) == 0)
+
+    def verify(self, dir_path, max_trials):
+        trials = 0
+        snap_path = "{0}/.snap".format(dir_path)
+        while (len(self.create_cbks) or len(self.remove_cbks)) and trials < max_trials:
+            snapshots = set(self.mount_a.ls(path=snap_path))
+            added = snapshots - self.snapshots
+            removed = self.snapshots - snapshots
+            if added:
+                for cbk in list(self.create_cbks):
+                    res = cbk(list(added))
+                    if res:
+                        self.remove_snap_create_cbk(cbk)
+                        break
+            if removed:
+                for cbk in list(self.remove_cbks):
+                    res = cbk(list(removed))
+                    if res:
+                        self.remove_snap_remove_cbk(cbk)
+                        break
+            self.snapshots = snapshots
+            trials += 1
+            time.sleep(1)
+
+    def calc_wait_time_and_snap_name(self, snap_sched_exec_epoch, schedule):
+        timo = self._schedule_to_timeout(schedule)
+        # calculate wait time upto the next minute
+        wait_timo = seconds_upto_next_schedule(snap_sched_exec_epoch, timo)
+
+        # expected "scheduled" snapshot name
+        ts_name = (datetime.utcfromtimestamp(snap_sched_exec_epoch)
+                   + timedelta(seconds=wait_timo)).strftime(TestSnapSchedules.SNAPSHOT_TS_FORMAT)
+        return (wait_timo, ts_name)
+
+    def verify_schedule(self, dir_path, schedules, retentions=[]):
+        log.debug(f'expected_schedule: {schedules}, expected_retention: {retentions}')
+
+        result = self.fs_snap_schedule_cmd('list', path=dir_path, format='json')
+        json_res = json.loads(result)
+        log.debug(f'json_res: {json_res}')
+
+        for schedule in schedules:
+            self.assertTrue(schedule in json_res['schedule'])
+        for retention in retentions:
+            self.assertTrue(retention in json_res['retention'])
+
+    def remove_snapshots(self, dir_path):
+        snap_path = f'{dir_path}/.snap'
+
+        snapshots = self.mount_a.ls(path=snap_path)
+        for snapshot in snapshots:
+            snapshot_path = os.path.join(snap_path, snapshot)
+            log.debug(f'removing snapshot: {snapshot_path}')
+            self.mount_a.run_shell(['rmdir', snapshot_path])
+
+    def test_non_existent_snap_schedule_list(self):
+        """Test listing snap schedules on a non-existing filesystem path failure"""
+        try:
+            self.fs_snap_schedule_cmd('list', path=TestSnapSchedules.TEST_DIRECTORY)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise RuntimeError('incorrect errno when listing a non-existing snap schedule')
+        else:
+            raise RuntimeError('expected "fs snap-schedule list" to fail')
+
+    def test_non_existent_schedule(self):
+        """Test listing non-existing snap schedules failure"""
+        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
+
+        try:
+            self.fs_snap_schedule_cmd('list', path=TestSnapSchedules.TEST_DIRECTORY)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise RuntimeError('incorrect errno when listing a non-existing snap schedule')
+        else:
+            raise RuntimeError('expected "fs snap-schedule list" returned fail')
+
+        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])
+
+    def test_snap_schedule_list_post_schedule_remove(self):
+        """Test listing snap schedules post removal of a schedule"""
+        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
+
+        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1h')
+
+        self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY)
+
+        try:
+            self.fs_snap_schedule_cmd('list', path=TestSnapSchedules.TEST_DIRECTORY)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise RuntimeError('incorrect errno when listing a non-existing snap schedule')
+        else:
+            raise RuntimeError('"fs snap-schedule list" returned error')
+
+        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])
+
+    def test_snap_schedule(self):
+        """Test existence of a scheduled snapshot"""
+        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
+
+        # set a schedule on the dir
+        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M')
+        exec_time = time.time()
+
+        timo, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
+        log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo}s...')
+        to_wait = timo + 2 # some leeway to avoid false failures...
+
+        # verify snapshot schedule
+        self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M'])
+
+        def verify_added(snaps_added):
+            log.debug(f'snapshots added={snaps_added}')
+            self.assertEqual(len(snaps_added), 1)
+            snapname = snaps_added[0]
+            if snapname.startswith('scheduled-'):
+                if snapname[10:26] == snap_sfx[:16]:
+                    self.check_scheduled_snapshot(exec_time, timo)
+                    return True
+            return False
+        self.add_snap_create_cbk(verify_added)
+        self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait)
+        self.assert_if_not_verified()
+
+        # remove snapshot schedule
+        self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY)
+
+        # remove all scheduled snapshots
+        self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY)
+
+        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])
+
+    def test_multi_snap_schedule(self):
+        """Test exisitence of multiple scheduled snapshots"""
+        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
+
+        # set schedules on the dir
+        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M')
+        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='2M')
+        exec_time = time.time()
+
+        timo_1, snap_sfx_1 = self.calc_wait_time_and_snap_name(exec_time, '1M')
+        log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx_1} in ~{timo_1}s...')
+        timo_2, snap_sfx_2 = self.calc_wait_time_and_snap_name(exec_time, '2M')
+        log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx_2} in ~{timo_2}s...')
+        to_wait = timo_2 + 2 # use max timeout
+
+        # verify snapshot schedule
+        self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M', '2M'])
+
+        def verify_added_1(snaps_added):
+            log.debug(f'snapshots added={snaps_added}')
+            self.assertEqual(len(snaps_added), 1)
+            snapname = snaps_added[0]
+            if snapname.startswith('scheduled-'):
+                if snapname[10:26] == snap_sfx_1[:16]:
+                    self.check_scheduled_snapshot(exec_time, timo_1)
+                    return True
+            return False
+        def verify_added_2(snaps_added):
+            log.debug(f'snapshots added={snaps_added}')
+            self.assertEqual(len(snaps_added), 1)
+            snapname = snaps_added[0]
+            if snapname.startswith('scheduled-'):
+                if snapname[10:26] == snap_sfx_2[:16]:
+                    self.check_scheduled_snapshot(exec_time, timo_2)
+                    return True
+            return False
+        self.add_snap_create_cbk(verify_added_1)
+        self.add_snap_create_cbk(verify_added_2)
+        self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait)
+        self.assert_if_not_verified()
+
+        # remove snapshot schedule
+        self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY)
+
+        # remove all scheduled snapshots
+        self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY)
+
+        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])
+
+    def test_snap_schedule_with_retention(self):
+        """Test scheduled snapshots along with rentention policy"""
+        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
+
+        # set a schedule on the dir
+        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M')
+        self.fs_snap_schedule_cmd('retention', 'add', path=TestSnapSchedules.TEST_DIRECTORY, retention_spec_or_period='1M')
+        exec_time = time.time()
+
+        timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
+        log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo_1}s...')
+        to_wait = timo_1 + 2 # some leeway to avoid false failures...
+
+        # verify snapshot schedule
+        self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M'], retentions=[{'M':1}])
+
+        def verify_added(snaps_added):
+            log.debug(f'snapshots added={snaps_added}')
+            self.assertEqual(len(snaps_added), 1)
+            snapname = snaps_added[0]
+            if snapname.startswith('scheduled-'):
+                if snapname[10:26] == snap_sfx[:16]:
+                    self.check_scheduled_snapshot(exec_time, timo_1)
+                    return True
+            return False
+        self.add_snap_create_cbk(verify_added)
+        self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait)
+        self.assert_if_not_verified()
+
+        timo_2 = timo_1 + 60 # expected snapshot removal timeout
+        def verify_removed(snaps_removed):
+            log.debug(f'snapshots removed={snaps_removed}')
+            self.assertEqual(len(snaps_removed), 1)
+            snapname = snaps_removed[0]
+            if snapname.startswith('scheduled-'):
+                if snapname[10:26] == snap_sfx[:16]:
+                    self.check_scheduled_snapshot(exec_time, timo_2)
+                    return True
+            return False
+        log.debug(f'expecting removal of snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo_2}s...')
+        to_wait = timo_2
+        self.add_snap_remove_cbk(verify_removed)
+        self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait+2)
+        self.assert_if_not_verified()
+
+        # remove snapshot schedule
+        self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY)
+
+        # remove all scheduled snapshots
+        self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY)
+
+        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])
+
+    def get_snap_stats(self, dir_path):
+        snap_path = f"{dir_path}/.snap"[1:]
+        snapshots = self.mount_a.ls(path=snap_path)
+        fs_count = len(snapshots)
+        log.debug(f'snapshots: {snapshots}');
+
+        result = self.fs_snap_schedule_cmd('status', path=dir_path,
+                                           format='json')
+        json_res = json.loads(result)[0]
+        db_count = int(json_res['created_count'])
+        log.debug(f'json_res: {json_res}')
+
+        snap_stats = dict()
+        snap_stats['fs_count'] = fs_count
+        snap_stats['db_count'] = db_count
+
+        return snap_stats
+
+    def verify_snap_stats(self, dir_path):
+        snap_stats = self.get_snap_stats(dir_path)
+        self.assertTrue(snap_stats['fs_count'] == snap_stats['db_count'])
+
+    def test_concurrent_snap_creates(self):
+        """Test concurrent snap creates in same file-system without db issues"""
+        """
+        Test snap creates at same cadence on same fs to verify correct stats.
+        A single SQLite DB Connection handle cannot be used to run concurrent
+        transactions and results transaction aborts. This test makes sure that
+        proper care has been taken in the code to avoid such situation by
+        verifying number of dirs created on the file system with the
+        created_count in the schedule_meta table for the specific path.
+        """
+        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
+
+        testdirs = []
+        for d in range(10):
+            testdirs.append(os.path.join("/", TestSnapSchedules.TEST_DIRECTORY, "dir" + str(d)))
+
+        for d in testdirs:
+            self.mount_a.run_shell(['mkdir', '-p', d[1:]])
+            self.fs_snap_schedule_cmd('add', path=d, snap_schedule='1M')
+
+        exec_time = time.time()
+        timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
+
+        for d in testdirs:
+            self.fs_snap_schedule_cmd('activate', path=d, snap_schedule='1M')
+
+        # we wait for 10 snaps to be taken
+        wait_time = timo_1 + 10 * 60 + 15
+        time.sleep(wait_time)
+
+        for d in testdirs:
+            self.fs_snap_schedule_cmd('deactivate', path=d, snap_schedule='1M')
+
+        for d in testdirs:
+            self.verify_snap_stats(d)
+
+        for d in testdirs:
+            self.fs_snap_schedule_cmd('remove', path=d, snap_schedule='1M')
+            self.remove_snapshots(d[1:])
+            self.mount_a.run_shell(['rmdir', d[1:]])
+
+    def test_snap_schedule_with_mgr_restart(self):
+        """Test that snap schedule is resumed after mgr restart"""
+        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
+        testdir = os.path.join("/", TestSnapSchedules.TEST_DIRECTORY, "test_restart")
+        self.mount_a.run_shell(['mkdir', '-p', testdir[1:]])
+        self.fs_snap_schedule_cmd('add', path=testdir, snap_schedule='1M')
+
+        exec_time = time.time()
+        timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
+
+        self.fs_snap_schedule_cmd('activate', path=testdir, snap_schedule='1M')
+
+        # we wait for 10 snaps to be taken
+        wait_time = timo_1 + 10 * 60 + 15
+        time.sleep(wait_time)
+
+        old_stats = self.get_snap_stats(testdir)
+        self.assertTrue(old_stats['fs_count'] == old_stats['db_count'])
+        self.assertTrue(old_stats['fs_count'] > 9)
+
+        # restart mgr
+        active_mgr = self.mgr_cluster.mon_manager.get_mgr_dump()['active_name']
+        log.debug(f'restarting active mgr: {active_mgr}')
+        self.mgr_cluster.mon_manager.revive_mgr(active_mgr)
+        time.sleep(300)  # sleep for 5 minutes
+        self.fs_snap_schedule_cmd('deactivate', path=testdir, snap_schedule='1M')
+
+        new_stats = self.get_snap_stats(testdir)
+        self.assertTrue(new_stats['fs_count'] == new_stats['db_count'])
+        self.assertTrue(new_stats['fs_count'] > old_stats['fs_count'])
+        self.assertTrue(new_stats['db_count'] > old_stats['db_count'])
+
+        # cleanup
+        self.fs_snap_schedule_cmd('remove', path=testdir, snap_schedule='1M')
+        self.remove_snapshots(testdir[1:])
+        self.mount_a.run_shell(['rmdir', testdir[1:]])
diff --git a/qa/tasks/cephfs/test_snapshots.py b/qa/tasks/cephfs/test_snapshots.py
new file mode 100644
index 000000000..306c80ce3
--- /dev/null
+++ b/qa/tasks/cephfs/test_snapshots.py
@@ -0,0 +1,539 @@
+import errno
+import logging
+import signal
+from textwrap import dedent
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra.run import CommandFailedError, Raw
+
+log = logging.getLogger(__name__)
+
+MDS_RESTART_GRACE = 60
+
+class TestSnapshots(CephFSTestCase):
+    MDSS_REQUIRED = 3
+    LOAD_SETTINGS = ["mds_max_snaps_per_dir"]
+
+    def _check_subtree(self, rank, path, status=None):
+        got_subtrees = self.fs.rank_asok(["get", "subtrees"], rank=rank, status=status)
+        for s in got_subtrees:
+            if s['dir']['path'] == path and s['auth_first'] == rank:
+                return True
+        return False
+
+    def _get_snapclient_dump(self, rank=0, status=None):
+        return self.fs.rank_asok(["dump", "snaps"], rank=rank, status=status)
+
+    def _get_snapserver_dump(self, rank=0, status=None):
+        return self.fs.rank_asok(["dump", "snaps", "--server"], rank=rank, status=status)
+
+    def _get_last_created_snap(self, rank=0, status=None):
+        return int(self._get_snapserver_dump(rank,status=status)["last_created"])
+
+    def _get_last_destroyed_snap(self, rank=0, status=None):
+        return int(self._get_snapserver_dump(rank,status=status)["last_destroyed"])
+
+    def _get_pending_snap_update(self, rank=0, status=None):
+        return self._get_snapserver_dump(rank,status=status)["pending_update"]
+
+    def _get_pending_snap_destroy(self, rank=0, status=None):
+        return self._get_snapserver_dump(rank,status=status)["pending_destroy"]
+
+    def test_allow_new_snaps_config(self):
+        """
+        Check whether 'allow_new_snaps' setting works
+        """
+        self.mount_a.run_shell(["mkdir", "test-allow-snaps"])
+
+        self.fs.set_allow_new_snaps(False);
+        try:
+            self.mount_a.run_shell(["mkdir", "test-allow-snaps/.snap/snap00"])
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM, "expected EPERM")
+        else:
+            self.fail("expected snap creatiion to fail")
+
+        self.fs.set_allow_new_snaps(True);
+        self.mount_a.run_shell(["mkdir", "test-allow-snaps/.snap/snap00"])
+        self.mount_a.run_shell(["rmdir", "test-allow-snaps/.snap/snap00"])
+        self.mount_a.run_shell(["rmdir", "test-allow-snaps"])
+
+    def test_kill_mdstable(self):
+        """
+        check snaptable transcation
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to forcibly kill mount")
+
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        # setup subtrees
+        self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
+        self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+        self._wait_subtrees([("/d1", 1)], rank=1, path="/d1")
+
+        last_created = self._get_last_created_snap(rank=0,status=status)
+
+        # mds_kill_mdstable_at:
+        #  1: MDSTableServer::handle_prepare
+        #  2: MDSTableServer::_prepare_logged
+        #  5: MDSTableServer::handle_commit
+        #  6: MDSTableServer::_commit_logged
+        for i in [1,2,5,6]:
+            log.info("testing snapserver mds_kill_mdstable_at={0}".format(i))
+
+            status = self.fs.status()
+            rank0 = self.fs.get_rank(rank=0, status=status)
+            self.fs.rank_freeze(True, rank=0)
+            self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
+            proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+            self.delete_mds_coredump(rank0['name']);
+
+            self.fs.rank_fail(rank=0)
+            self.fs.mds_restart(rank0['name'])
+            self.wait_for_daemon_start([rank0['name']])
+            status = self.fs.wait_for_daemons()
+
+            proc.wait()
+            last_created += 1
+            self.wait_until_true(lambda: self._get_last_created_snap(rank=0) == last_created, timeout=30)
+
+        self.set_conf("mds", "mds_reconnect_timeout", "5")
+
+        self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+        # set mds_kill_mdstable_at, also kill snapclient
+        for i in [2,5,6]:
+            log.info("testing snapserver mds_kill_mdstable_at={0}, also kill snapclient".format(i))
+            status = self.fs.status()
+            last_created = self._get_last_created_snap(rank=0, status=status)
+
+            rank0 = self.fs.get_rank(rank=0, status=status)
+            rank1 = self.fs.get_rank(rank=1, status=status)
+            self.fs.rank_freeze(True, rank=0) # prevent failover...
+            self.fs.rank_freeze(True, rank=1) # prevent failover...
+            self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
+            proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*3);
+            self.delete_mds_coredump(rank0['name']);
+
+            self.fs.rank_signal(signal.SIGKILL, rank=1)
+
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+            self.fs.rank_fail(rank=0)
+            self.fs.mds_restart(rank0['name'])
+            self.wait_for_daemon_start([rank0['name']])
+
+            self.fs.wait_for_state('up:resolve', rank=0, timeout=MDS_RESTART_GRACE)
+            if i in [2,5]:
+                self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
+            elif i == 6:
+                self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0)
+                self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+            self.fs.rank_fail(rank=1)
+            self.fs.mds_restart(rank1['name'])
+            self.wait_for_daemon_start([rank1['name']])
+            self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+
+            if i in [2,5]:
+                self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+                if i == 2:
+                    self.assertEqual(self._get_last_created_snap(rank=0), last_created)
+                else:
+                    self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+            self.mount_a.mount_wait()
+
+        self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+        # mds_kill_mdstable_at:
+        #  3: MDSTableClient::handle_request (got agree)
+        #  4: MDSTableClient::commit
+        #  7: MDSTableClient::handle_request (got ack)
+        for i in [3,4,7]:
+            log.info("testing snapclient mds_kill_mdstable_at={0}".format(i))
+            last_created = self._get_last_created_snap(rank=0)
+
+            status = self.fs.status()
+            rank1 = self.fs.get_rank(rank=1, status=status)
+            self.fs.rank_freeze(True, rank=1) # prevent failover...
+            self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
+            proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+            self.delete_mds_coredump(rank1['name']);
+
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+            if i in [3,4]:
+                self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
+            elif i == 7:
+                self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0)
+                self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+            self.fs.rank_fail(rank=1)
+            self.fs.mds_restart(rank1['name'])
+            self.wait_for_daemon_start([rank1['name']])
+            status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+
+            if i in [3,4]:
+                self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+                if i == 3:
+                    self.assertEqual(self._get_last_created_snap(rank=0), last_created)
+                else:
+                    self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+            self.mount_a.mount_wait()
+
+        self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+        # mds_kill_mdstable_at:
+        #  3: MDSTableClient::handle_request (got agree)
+        #  8: MDSTableServer::handle_rollback
+        log.info("testing snapclient mds_kill_mdstable_at=3, snapserver mds_kill_mdstable_at=8")
+        last_created = self._get_last_created_snap(rank=0)
+
+        status = self.fs.status()
+        rank0 = self.fs.get_rank(rank=0, status=status)
+        rank1 = self.fs.get_rank(rank=1, status=status)
+        self.fs.rank_freeze(True, rank=0)
+        self.fs.rank_freeze(True, rank=1)
+        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status)
+        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status)
+        proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+        self.delete_mds_coredump(rank1['name']);
+
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
+
+        self.fs.rank_fail(rank=1)
+        self.fs.mds_restart(rank1['name'])
+        self.wait_for_daemon_start([rank1['name']])
+
+        # rollback triggers assertion
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+        self.delete_mds_coredump(rank0['name']);
+        self.fs.rank_fail(rank=0)
+        self.fs.mds_restart(rank0['name'])
+        self.wait_for_daemon_start([rank0['name']])
+        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+
+        # mds.1 should re-send rollback message
+        self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+        self.assertEqual(self._get_last_created_snap(rank=0), last_created)
+
+        self.mount_a.mount_wait()
+
+    def test_snapclient_cache(self):
+        """
+        check if snapclient cache gets synced properly
+        """
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(3)
+        status = self.fs.wait_for_daemons()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("d0/d2", "ceph.dir.pin", "2")
+        self._wait_subtrees([("/d0", 0), ("/d0/d1", 1), ("/d0/d2", 2)], rank="all", status=status, path="/d0")
+
+        def _check_snapclient_cache(snaps_dump, cache_dump=None, rank=0):
+            if cache_dump is None:
+                cache_dump = self._get_snapclient_dump(rank=rank)
+            for key, value in cache_dump.items():
+                if value != snaps_dump[key]:
+                    return False
+            return True;
+
+        # sync after mksnap
+        last_created = self._get_last_created_snap(rank=0)
+        self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s1", "d0/d1/dir/.snap/s2"])
+        self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+        self.assertGreater(self._get_last_created_snap(rank=0), last_created)
+
+        snaps_dump = self._get_snapserver_dump(rank=0)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+        # sync after rmsnap
+        last_destroyed = self._get_last_destroyed_snap(rank=0)
+        self.mount_a.run_shell(["rmdir", "d0/d1/dir/.snap/s1"])
+        self.wait_until_true(lambda: len(self._get_pending_snap_destroy(rank=0)) == 0, timeout=30)
+        self.assertGreater(self._get_last_destroyed_snap(rank=0), last_destroyed)
+
+        snaps_dump = self._get_snapserver_dump(rank=0)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+        # sync during mds recovers
+        self.fs.rank_fail(rank=2)
+        status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+        self.fs.rank_fail(rank=0)
+        self.fs.rank_fail(rank=1)
+        status = self.fs.wait_for_daemons()
+        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+        # kill at MDSTableClient::handle_notify_prep
+        status = self.fs.status()
+        rank2 = self.fs.get_rank(rank=2, status=status)
+        self.fs.rank_freeze(True, rank=2)
+        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
+        proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+        self.delete_mds_coredump(rank2['name']);
+
+        # mksnap should wait for notify ack from mds.2
+        self.assertFalse(proc.finished);
+
+        # mksnap should proceed after mds.2 fails
+        self.fs.rank_fail(rank=2)
+        self.wait_until_true(lambda: proc.finished, timeout=30);
+
+        self.fs.mds_restart(rank2['name'])
+        self.wait_for_daemon_start([rank2['name']])
+        status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+
+        self.mount_a.run_shell(["rmdir", Raw("d0/d1/dir/.snap/*")])
+
+        # kill at MDSTableClient::commit
+        # the recovering mds should sync all mds' cache when it enters resolve stage
+        self.set_conf("mds", "mds_reconnect_timeout", "5")
+        for i in range(1, 4):
+            status = self.fs.status()
+            rank2 = self.fs.get_rank(rank=2, status=status)
+            self.fs.rank_freeze(True, rank=2)
+            self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
+            last_created = self._get_last_created_snap(rank=0)
+            proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+            self.delete_mds_coredump(rank2['name']);
+
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+            self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
+
+            if i in [2,4]:
+                self.fs.rank_fail(rank=0)
+            if i in [3,4]:
+                self.fs.rank_fail(rank=1)
+
+            self.fs.rank_fail(rank=2)
+            self.fs.mds_restart(rank2['name'])
+            self.wait_for_daemon_start([rank2['name']])
+            status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+
+            rank0_cache = self._get_snapclient_dump(rank=0)
+            rank1_cache = self._get_snapclient_dump(rank=1)
+            rank2_cache = self._get_snapclient_dump(rank=2)
+
+            self.assertGreater(int(rank0_cache["last_created"]), last_created)
+            self.assertEqual(rank0_cache, rank1_cache);
+            self.assertEqual(rank0_cache, rank2_cache);
+
+            self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+
+            snaps_dump = self._get_snapserver_dump(rank=0)
+            self.assertEqual(snaps_dump["last_created"], rank0_cache["last_created"])
+            self.assertTrue(_check_snapclient_cache(snaps_dump, cache_dump=rank0_cache));
+
+            self.mount_a.mount_wait()
+
+        self.mount_a.run_shell(["rmdir", Raw("d0/d2/dir/.snap/*")])
+
+    def test_multimds_mksnap(self):
+        """
+        check if snapshot takes effect across authority subtrees
+        """
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d1/empty"])
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1")
+        self._wait_subtrees([("/d0", 0), ("/d0/d1", 1)], rank="all", status=status, path="/d0")
+
+        self.mount_a.write_test_pattern("d0/d1/file_a", 8 * 1024 * 1024)
+        self.mount_a.run_shell(["mkdir", "d0/.snap/s1"])
+        self.mount_a.run_shell(["rm", "-f", "d0/d1/file_a"])
+        self.mount_a.validate_test_pattern("d0/.snap/s1/d1/file_a", 8 * 1024 * 1024)
+
+        self.mount_a.run_shell(["rmdir", "d0/.snap/s1"])
+        self.mount_a.run_shell(["rm", "-rf", "d0"])
+
+    def test_multimds_past_parents(self):
+        """
+        check if past parents are properly recorded during across authority rename
+        """
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.run_shell_payload("mkdir -p {d0,d1}/empty")
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+        self._wait_subtrees([("/d0", 0), ("/d1", 1)], rank=0, status=status)
+
+        self.mount_a.run_shell(["mkdir", "d0/d3"])
+        self.mount_a.run_shell(["mkdir", "d0/.snap/s1"])
+        snap_name = self.mount_a.run_shell(["ls", "d0/d3/.snap"]).stdout.getvalue()
+
+        self.mount_a.run_shell(["mv", "d0/d3", "d1/d3"])
+        snap_name1 = self.mount_a.run_shell(["ls", "d1/d3/.snap"]).stdout.getvalue()
+        self.assertEqual(snap_name1, snap_name);
+
+        self.mount_a.run_shell(["rmdir", "d0/.snap/s1"])
+        snap_name1 = self.mount_a.run_shell(["ls", "d1/d3/.snap"]).stdout.getvalue()
+        self.assertEqual(snap_name1, "");
+
+        self.mount_a.run_shell(["rm", "-rf", "d0", "d1"])
+
+    def test_multimds_hardlink(self):
+        """
+        check if hardlink snapshot works in multimds setup
+        """
+        self.fs.set_allow_new_snaps(True);
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        self.mount_a.run_shell_payload("mkdir -p {d0,d1}/empty")
+
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+        self._wait_subtrees([("/d0", 0), ("/d1", 1)], rank=0, status=status)
+
+        self.mount_a.run_python(dedent("""
+            import os
+            open(os.path.join("{path}", "d0/file1"), 'w').write("asdf")
+            open(os.path.join("{path}", "d0/file2"), 'w').write("asdf")
+            """.format(path=self.mount_a.mountpoint)
+        ))
+
+        self.mount_a.run_shell(["ln", "d0/file1", "d1/file1"])
+        self.mount_a.run_shell(["ln", "d0/file2", "d1/file2"])
+
+        self.mount_a.run_shell(["mkdir", "d1/.snap/s1"])
+
+        self.mount_a.run_python(dedent("""
+            import os
+            open(os.path.join("{path}", "d0/file1"), 'w').write("qwer")
+            """.format(path=self.mount_a.mountpoint)
+        ))
+
+        self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file1"])
+
+        self.mount_a.run_shell(["rm", "-f", "d0/file2"])
+        self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file2"])
+
+        self.mount_a.run_shell(["rm", "-f", "d1/file2"])
+        self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file2"])
+
+        self.mount_a.run_shell(["rmdir", "d1/.snap/s1"])
+        self.mount_a.run_shell(["rm", "-rf", "d0", "d1"])
+
+    class SnapLimitViolationException(Exception):
+        failed_snapshot_number = -1
+
+        def __init__(self, num):
+            self.failed_snapshot_number = num
+
+    def get_snap_name(self, dir_name, sno):
+            sname = "{dir_name}/.snap/s_{sno}".format(dir_name=dir_name, sno=sno)
+            return sname
+
+    def create_snap_dir(self, sname):
+        self.mount_a.run_shell(["mkdir", sname])
+
+    def delete_dir_and_snaps(self, dir_name, snaps):
+        for sno in range(1, snaps+1, 1):
+            sname = self.get_snap_name(dir_name, sno)
+            self.mount_a.run_shell(["rmdir", sname])
+        self.mount_a.run_shell(["rmdir", dir_name])
+
+    def create_dir_and_snaps(self, dir_name, snaps):
+        self.mount_a.run_shell(["mkdir", dir_name])
+
+        for sno in range(1, snaps+1, 1):
+            sname = self.get_snap_name(dir_name, sno)
+            try:
+                self.create_snap_dir(sname)
+            except CommandFailedError as e:
+                # failing at the last mkdir beyond the limit is expected
+                if sno == snaps:
+                    log.info("failed while creating snap #{}: {}".format(sno, repr(e)))
+                    raise TestSnapshots.SnapLimitViolationException(sno)
+
+    def test_mds_max_snaps_per_dir_default_limit(self):
+        """
+        Test the newly introudced option named mds_max_snaps_per_dir
+        Default snaps limit is 100
+        Test if the default number of snapshot directories can be created
+        """
+        self.create_dir_and_snaps("accounts", int(self.mds_max_snaps_per_dir))
+        self.delete_dir_and_snaps("accounts", int(self.mds_max_snaps_per_dir))
+
+    def test_mds_max_snaps_per_dir_with_increased_limit(self):
+        """
+        Test the newly introudced option named mds_max_snaps_per_dir
+        First create 101 directories and ensure that the 101st directory
+        creation fails. Then increase the default by one and see if the
+        additional directory creation succeeds
+        """
+        # first test the default limit
+        new_limit = int(self.mds_max_snaps_per_dir)
+        self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)])
+        try:
+            self.create_dir_and_snaps("accounts", new_limit + 1)
+        except TestSnapshots.SnapLimitViolationException as e:
+            if e.failed_snapshot_number == (new_limit + 1):
+                pass
+        # then increase the limit by one and test
+        new_limit = new_limit + 1
+        self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)])
+        sname = self.get_snap_name("accounts", new_limit)
+        self.create_snap_dir(sname)
+        self.delete_dir_and_snaps("accounts", new_limit)
+
+    def test_mds_max_snaps_per_dir_with_reduced_limit(self):
+        """
+        Test the newly introudced option named mds_max_snaps_per_dir
+        First create 99 directories. Then reduce the limit to 98. Then try
+        creating another directory and ensure that additional directory
+        creation fails.
+        """
+        # first test the new limit
+        new_limit = int(self.mds_max_snaps_per_dir) - 1
+        self.create_dir_and_snaps("accounts", new_limit)
+        sname = self.get_snap_name("accounts", new_limit + 1)
+        # then reduce the limit by one and test
+        new_limit = new_limit - 1
+        self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)])
+        try:
+            self.create_snap_dir(sname)
+        except CommandFailedError:
+            # after reducing limit we expect the new snapshot creation to fail
+            pass
+        self.delete_dir_and_snaps("accounts", new_limit + 1)
diff --git a/qa/tasks/cephfs/test_strays.py b/qa/tasks/cephfs/test_strays.py
new file mode 100644
index 000000000..9dd71c7bf
--- /dev/null
+++ b/qa/tasks/cephfs/test_strays.py
@@ -0,0 +1,1026 @@
+import json
+import time
+import logging
+from textwrap import dedent
+import datetime
+import gevent
+
+from teuthology.orchestra.run import CommandFailedError, Raw
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+class TestStrays(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    OPS_THROTTLE = 1
+    FILES_THROTTLE = 2
+
+    # Range of different file sizes used in throttle test's workload
+    throttle_workload_size_range = 16
+
+    @for_teuthology
+    def test_ops_throttle(self):
+        self._test_throttling(self.OPS_THROTTLE)
+
+    @for_teuthology
+    def test_files_throttle(self):
+        self._test_throttling(self.FILES_THROTTLE)
+
+    def test_dir_deletion(self):
+        """
+        That when deleting a bunch of dentries and the containing
+        directory, everything gets purged.
+        Catches cases where the client might e.g. fail to trim
+        the unlinked dir from its cache.
+        """
+        file_count = 1000
+        create_script = dedent("""
+            import os
+
+            mountpoint = "{mountpoint}"
+            subdir = "delete_me"
+            size = {size}
+            file_count = {file_count}
+            os.mkdir(os.path.join(mountpoint, subdir))
+            for i in range(0, file_count):
+                filename = "{{0}}_{{1}}.bin".format(i, size)
+                with open(os.path.join(mountpoint, subdir, filename), 'w') as f:
+                    f.write(size * 'x')
+        """.format(
+            mountpoint=self.mount_a.mountpoint,
+            size=1024,
+            file_count=file_count
+        ))
+
+        self.mount_a.run_python(create_script)
+
+        # That the dirfrag object is created
+        self.fs.mds_asok(["flush", "journal"])
+        dir_ino = self.mount_a.path_to_ino("delete_me")
+        self.assertTrue(self.fs.dirfrag_exists(dir_ino, 0))
+
+        # Remove everything
+        self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+        self.fs.mds_asok(["flush", "journal"])
+
+        # That all the removed files get created as strays
+        strays = self.get_mdc_stat("strays_created")
+        self.assertEqual(strays, file_count + 1)
+
+        # That the strays all get enqueued for purge
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("strays_enqueued"),
+            strays,
+            timeout=600
+
+        )
+
+        # That all the purge operations execute
+        self.wait_until_equal(
+            lambda: self.get_stat("purge_queue", "pq_executed"),
+            strays,
+            timeout=600
+        )
+
+        # That finally, the directory metadata object is gone
+        self.assertFalse(self.fs.dirfrag_exists(dir_ino, 0))
+
+        # That finally, the data objects are all gone
+        self.await_data_pool_empty()
+
+    def _test_throttling(self, throttle_type):
+        self.data_log = []
+        try:
+            return self._do_test_throttling(throttle_type)
+        except:
+            for l in self.data_log:
+                log.info(",".join([l_.__str__() for l_ in l]))
+            raise
+
+    def _do_test_throttling(self, throttle_type):
+        """
+        That the mds_max_purge_ops setting is respected
+        """
+
+        def set_throttles(files, ops):
+            """
+            Helper for updating ops/files limits, and calculating effective
+            ops_per_pg setting to give the same ops limit.
+            """
+            self.set_conf('mds', 'mds_max_purge_files', "%d" % files)
+            self.set_conf('mds', 'mds_max_purge_ops', "%d" % ops)
+
+            pgs = self.fs.mon_manager.get_pool_int_property(
+                self.fs.get_data_pool_name(),
+                "pg_num"
+            )
+            ops_per_pg = float(ops) / pgs
+            self.set_conf('mds', 'mds_max_purge_ops_per_pg', "%s" % ops_per_pg)
+
+        # Test conditions depend on what we're going to be exercising.
+        # * Lift the threshold on whatever throttle we are *not* testing, so
+        #   that the throttle of interest is the one that will be the bottleneck
+        # * Create either many small files (test file count throttling) or fewer
+        #   large files (test op throttling)
+        if throttle_type == self.OPS_THROTTLE:
+            set_throttles(files=100000000, ops=16)
+            size_unit = 1024 * 1024  # big files, generate lots of ops
+            file_multiplier = 100
+        elif throttle_type == self.FILES_THROTTLE:
+            # The default value of file limit is pretty permissive, so to avoid
+            # the test running too fast, create lots of files and set the limit
+            # pretty low.
+            set_throttles(ops=100000000, files=6)
+            size_unit = 1024  # small, numerous files
+            file_multiplier = 200
+        else:
+            raise NotImplementedError(throttle_type)
+
+        # Pick up config changes
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        create_script = dedent("""
+            import os
+
+            mountpoint = "{mountpoint}"
+            subdir = "delete_me"
+            size_unit = {size_unit}
+            file_multiplier = {file_multiplier}
+            os.mkdir(os.path.join(mountpoint, subdir))
+            for i in range(0, file_multiplier):
+                for size in range(0, {size_range}*size_unit, size_unit):
+                    filename = "{{0}}_{{1}}.bin".format(i, size // size_unit)
+                    with open(os.path.join(mountpoint, subdir, filename), 'w') as f:
+                        f.write(size * 'x')
+        """.format(
+            mountpoint=self.mount_a.mountpoint,
+            size_unit=size_unit,
+            file_multiplier=file_multiplier,
+            size_range=self.throttle_workload_size_range
+        ))
+
+        self.mount_a.run_python(create_script)
+
+        # We will run the deletion in the background, to reduce the risk of it completing before
+        # we have started monitoring the stray statistics.
+        def background():
+            self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+            self.fs.mds_asok(["flush", "journal"])
+
+        background_thread = gevent.spawn(background)
+
+        total_inodes = file_multiplier * self.throttle_workload_size_range + 1
+        mds_max_purge_ops = int(self.fs.get_config("mds_max_purge_ops", 'mds'))
+        mds_max_purge_files = int(self.fs.get_config("mds_max_purge_files", 'mds'))
+
+        # During this phase we look for the concurrent ops to exceed half
+        # the limit (a heuristic) and not exceed the limit (a correctness
+        # condition).
+        purge_timeout = 600
+        elapsed = 0
+        files_high_water = 0
+        ops_high_water = 0
+
+        while True:
+            stats = self.fs.mds_asok(['perf', 'dump'])
+            mdc_stats = stats['mds_cache']
+            pq_stats = stats['purge_queue']
+            if elapsed >= purge_timeout:
+                raise RuntimeError("Timeout waiting for {0} inodes to purge, stats:{1}".format(total_inodes, mdc_stats))
+
+            num_strays = mdc_stats['num_strays']
+            num_strays_purging = pq_stats['pq_executing']
+            num_purge_ops = pq_stats['pq_executing_ops']
+            files_high_water = pq_stats['pq_executing_high_water']
+            ops_high_water = pq_stats['pq_executing_ops_high_water']
+
+            self.data_log.append([datetime.datetime.now(), num_strays, num_strays_purging, num_purge_ops, files_high_water, ops_high_water])
+
+            total_strays_created = mdc_stats['strays_created']
+            total_strays_purged = pq_stats['pq_executed']
+
+            if total_strays_purged == total_inodes:
+                log.info("Complete purge in {0} seconds".format(elapsed))
+                break
+            elif total_strays_purged > total_inodes:
+                raise RuntimeError("Saw more strays than expected, mdc stats: {0}".format(mdc_stats))
+            else:
+                if throttle_type == self.OPS_THROTTLE:
+                    # 11 is filer_max_purge_ops plus one for the backtrace:
+                    # limit is allowed to be overshot by this much.
+                    if num_purge_ops > mds_max_purge_ops + 11:
+                        raise RuntimeError("num_purge_ops violates threshold {0}/{1}".format(
+                            num_purge_ops, mds_max_purge_ops
+                        ))
+                elif throttle_type == self.FILES_THROTTLE:
+                    if num_strays_purging > mds_max_purge_files:
+                        raise RuntimeError("num_strays_purging violates threshold {0}/{1}".format(
+                            num_strays_purging, mds_max_purge_files
+                        ))
+                else:
+                    raise NotImplementedError(throttle_type)
+
+                log.info("Waiting for purge to complete {0}/{1}, {2}/{3}".format(
+                    num_strays_purging, num_strays,
+                    total_strays_purged, total_strays_created
+                ))
+                time.sleep(1)
+                elapsed += 1
+
+        background_thread.join()
+
+        # Check that we got up to a respectable rate during the purge.  This is totally
+        # racy, but should be safeish unless the cluster is pathologically slow, or
+        # insanely fast such that the deletions all pass before we have polled the
+        # statistics.
+        if throttle_type == self.OPS_THROTTLE:
+            if ops_high_water < mds_max_purge_ops // 2:
+                raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format(
+                    ops_high_water, mds_max_purge_ops
+                ))
+            # The MDS may go over mds_max_purge_ops for some items, like a
+            # heavily fragmented directory.  The throttle does not kick in
+            # until *after* we reach or exceed the limit.  This is expected
+            # because we don't want to starve the PQ or never purge a
+            # particularly large file/directory.
+            self.assertLessEqual(ops_high_water, mds_max_purge_ops+64)
+        elif throttle_type == self.FILES_THROTTLE:
+            if files_high_water < mds_max_purge_files // 2:
+                raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format(
+                    files_high_water, mds_max_purge_files
+                ))
+            self.assertLessEqual(files_high_water, mds_max_purge_files)
+
+        # Sanity check all MDC stray stats
+        stats = self.fs.mds_asok(['perf', 'dump'])
+        mdc_stats = stats['mds_cache']
+        pq_stats = stats['purge_queue']
+        self.assertEqual(mdc_stats['num_strays'], 0)
+        self.assertEqual(mdc_stats['num_strays_delayed'], 0)
+        self.assertEqual(pq_stats['pq_executing'], 0)
+        self.assertEqual(pq_stats['pq_executing_ops'], 0)
+        self.assertEqual(mdc_stats['strays_created'], total_inodes)
+        self.assertEqual(mdc_stats['strays_enqueued'], total_inodes)
+        self.assertEqual(pq_stats['pq_executed'], total_inodes)
+
+    def get_mdc_stat(self, name, mds_id=None):
+        return self.get_stat("mds_cache", name, mds_id)
+
+    def get_stat(self, subsys, name, mds_id=None):
+        return self.fs.mds_asok(['perf', 'dump', subsys, name],
+                                mds_id=mds_id)[subsys][name]
+
+    def _wait_for_counter(self, subsys, counter, expect_val, timeout=60,
+                          mds_id=None):
+        self.wait_until_equal(
+            lambda: self.get_stat(subsys, counter, mds_id),
+            expect_val=expect_val, timeout=timeout,
+            reject_fn=lambda x: x > expect_val
+        )
+
+    def test_open_inode(self):
+        """
+        That the case of a dentry unlinked while a client holds an
+        inode open is handled correctly.
+
+        The inode should be moved into a stray dentry, while the original
+        dentry and directory should be purged.
+
+        The inode's data should be purged when the client eventually closes
+        it.
+        """
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Write some bytes to a file
+        size_mb = 8
+
+        # Hold the file open
+        p = self.mount_a.open_background("open_file")
+        self.mount_a.write_n_mb("open_file", size_mb)
+        open_file_ino = self.mount_a.path_to_ino("open_file")
+
+        self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
+
+        # Unlink the dentry
+        self.mount_a.run_shell(["rm", "-f", "open_file"])
+
+        # Wait to see the stray count increment
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=1, timeout=60, reject_fn=lambda x: x > 1)
+
+        # See that while the stray count has incremented, none have passed
+        # on to the purge queue
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0)
+
+        # See that the client still holds 2 caps
+        self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
+
+        # See that the data objects remain in the data pool
+        self.assertTrue(self.fs.data_objects_present(open_file_ino, size_mb * 1024 * 1024))
+
+        # Now close the file
+        self.mount_a.kill_background(p)
+
+        # Wait to see the client cap count decrement
+        self.wait_until_equal(
+            lambda: self.get_session(mount_a_client_id)['num_caps'],
+            expect_val=1, timeout=60, reject_fn=lambda x: x > 2 or x < 1
+        )
+        # Wait to see the purge counter increment, stray count go to zero
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0, timeout=6, reject_fn=lambda x: x > 1
+        )
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        # See that the data objects no longer exist
+        self.assertTrue(self.fs.data_objects_absent(open_file_ino, size_mb * 1024 * 1024))
+
+        self.await_data_pool_empty()
+
+    def test_reintegration_limit(self):
+        """
+        That the reintegration is not blocked by full directories.
+        """
+
+        LOW_LIMIT = 50
+        self.config_set('mds', 'mds_bal_fragment_size_max', str(LOW_LIMIT))
+        time.sleep(10) # for config to reach MDS; async create is fast!!
+
+        last_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.mount_a.run_shell_payload("""
+        mkdir a b
+        for i in `seq 1 50`; do
+           touch a/"$i"
+           ln a/"$i" b/"$i"
+        done
+        sync -f a b
+        rm a/*
+        """)
+
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.assertGreater(curr_reintegrated, last_reintegrated)
+
+
+    def test_hardlink_reintegration(self):
+        """
+        That removal of primary dentry of hardlinked inode results
+        in reintegration of inode into the previously-remote dentry,
+        rather than lingering as a stray indefinitely.
+        """
+        # Write some bytes to file_a
+        size_mb = 8
+        self.mount_a.run_shell(["mkdir", "dir_1"])
+        self.mount_a.write_n_mb("dir_1/file_a", size_mb)
+        ino = self.mount_a.path_to_ino("dir_1/file_a")
+
+        # Create a hardlink named file_b
+        self.mount_a.run_shell(["mkdir", "dir_2"])
+        self.mount_a.run_shell(["ln", "dir_1/file_a", "dir_2/file_b"])
+        self.assertEqual(self.mount_a.path_to_ino("dir_2/file_b"), ino)
+
+        # Flush journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that backtrace for the file points to the file_a path
+        pre_unlink_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(pre_unlink_bt['ancestors'][0]['dname'], "file_a")
+
+        # empty mds cache. otherwise mds reintegrates stray when unlink finishes
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(['flush', 'journal'])
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount_wait()
+
+        # Unlink file_a
+        self.mount_a.run_shell(["rm", "-f", "dir_1/file_a"])
+
+        # See that a stray was created
+        self.assertEqual(self.get_mdc_stat("num_strays"), 1)
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+
+        # Wait, see that data objects are still present (i.e. that the
+        # stray did not advance to purging given time)
+        time.sleep(30)
+        self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024))
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0)
+
+        # See that before reintegration, the inode's backtrace points to a stray dir
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assertTrue(self.get_backtrace_path(ino).startswith("stray"))
+
+        last_reintegrated = self.get_mdc_stat("strays_reintegrated")
+
+        # Do a metadata operation on the remaining link (mv is heavy handed, but
+        # others like touch may be satisfied from caps without poking MDS)
+        self.mount_a.run_shell(["mv", "dir_2/file_b", "dir_2/file_c"])
+
+        # Stray reintegration should happen as a result of the eval_remote call
+        # on responding to a client request.
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        # See the reintegration counter increment
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.assertGreater(curr_reintegrated, last_reintegrated)
+        last_reintegrated = curr_reintegrated
+
+        # Flush the journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that the backtrace for the file points to the remaining link's path
+        post_reint_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c")
+
+        # mds should reintegrates stray when unlink finishes
+        self.mount_a.run_shell(["ln", "dir_2/file_c", "dir_2/file_d"])
+        self.mount_a.run_shell(["rm", "-f", "dir_2/file_c"])
+
+        # Stray reintegration should happen as a result of the notify_stray call
+        # on completion of unlink
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        # See the reintegration counter increment
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.assertGreater(curr_reintegrated, last_reintegrated)
+        last_reintegrated = curr_reintegrated
+
+        # Flush the journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that the backtrace for the file points to the newest link's path
+        post_reint_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_d")
+
+        # Now really delete it
+        self.mount_a.run_shell(["rm", "-f", "dir_2/file_d"])
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        self.assert_purge_idle()
+        self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024))
+
+        # We caused the inode to go stray 3 times
+        self.assertEqual(self.get_mdc_stat("strays_created"), 3)
+        # We purged it at the last
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 1)
+
+    def test_reintegration_via_scrub(self):
+        """
+        That reintegration is triggered via recursive scrub.
+        """
+
+        self.mount_a.run_shell_payload("""
+        mkdir -p a b
+        for i in `seq 1 50`; do
+           touch a/"$i"
+           ln a/"$i" b/"$i"
+        done
+        sync -f .
+        """)
+
+        self.mount_a.remount() # drop caps/cache
+        self.fs.rank_tell(["flush", "journal"])
+        self.fs.rank_fail()
+        self.fs.wait_for_daemons()
+
+        # only / in cache, reintegration cannot happen
+        self.wait_until_equal(
+            lambda: len(self.fs.rank_tell(["dump", "tree", "/"])),
+            expect_val=3,
+            timeout=60
+        )
+
+        last_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.mount_a.run_shell_payload("""
+        rm a/*
+        sync -f .
+        """)
+        self.wait_until_equal(
+            lambda: len(self.fs.rank_tell(["dump", "tree", "/"])),
+            expect_val=3,
+            timeout=60
+        )
+        self.assertEqual(self.get_mdc_stat("num_strays"), 50)
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.assertEqual(last_reintegrated, curr_reintegrated)
+
+        self.fs.rank_tell(["scrub", "start", "/", "recursive,force"])
+
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        # N.B.: reintegrate (rename RPC) may be tried multiple times from different code paths
+        self.assertGreaterEqual(curr_reintegrated, last_reintegrated+50)
+
+    def test_mv_hardlink_cleanup(self):
+        """
+        That when doing a rename from A to B, and B has hardlinks,
+        then we make a stray for B which is then reintegrated
+        into one of his hardlinks.
+        """
+        # Create file_a, file_b, and a hardlink to file_b
+        size_mb = 8
+        self.mount_a.write_n_mb("file_a", size_mb)
+        file_a_ino = self.mount_a.path_to_ino("file_a")
+
+        self.mount_a.write_n_mb("file_b", size_mb)
+        file_b_ino = self.mount_a.path_to_ino("file_b")
+
+        self.mount_a.run_shell(["ln", "file_b", "linkto_b"])
+        self.assertEqual(self.mount_a.path_to_ino("linkto_b"), file_b_ino)
+
+        # mv file_a file_b
+        self.mount_a.run_shell(["mv", "file_a", "file_b"])
+
+        # Stray reintegration should happen as a result of the notify_stray call on
+        # completion of rename
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        self.assertGreaterEqual(self.get_mdc_stat("strays_reintegrated"), 1)
+
+        # No data objects should have been deleted, as both files still have linkage.
+        self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
+        self.assertTrue(self.fs.data_objects_present(file_b_ino, size_mb * 1024 * 1024))
+
+        self.fs.mds_asok(['flush', 'journal'])
+
+        post_reint_bt = self.fs.read_backtrace(file_b_ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "linkto_b")
+
+    def _setup_two_ranks(self):
+        # Set up two MDSs
+        self.fs.set_max_mds(2)
+
+        # See that we have two active MDSs
+        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        active_mds_names = self.fs.get_active_names()
+        rank_0_id = active_mds_names[0]
+        rank_1_id = active_mds_names[1]
+        log.info("Ranks 0 and 1 are {0} and {1}".format(
+            rank_0_id, rank_1_id))
+
+        # Get rid of other MDS daemons so that it's easier to know which
+        # daemons to expect in which ranks after restarts
+        for unneeded_mds in set(self.mds_cluster.mds_ids) - {rank_0_id, rank_1_id}:
+            self.mds_cluster.mds_stop(unneeded_mds)
+            self.mds_cluster.mds_fail(unneeded_mds)
+
+        return rank_0_id, rank_1_id
+
+    def _force_migrate(self, path, rank=1):
+        """
+        :param to_id: MDS id to move it to
+        :param path: Filesystem path (string) to move
+        :return: None
+        """
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", str(rank), path])
+        rpath = "/"+path
+        self._wait_subtrees([(rpath, rank)], rank=rank, path=rpath)
+
+    def _is_stopped(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank not in [i['rank'] for i in mds_map['info'].values()]
+
+    def test_purge_on_shutdown(self):
+        """
+        That when an MDS rank is shut down, its purge queue is
+        drained in the process.
+        """
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
+        self.mds_cluster.mds_fail_restart(rank_1_id)
+        self.fs.wait_for_daemons()
+
+        file_count = 5
+
+        self.mount_a.create_n_files("delete_me/file", file_count)
+
+        self._force_migrate("delete_me")
+
+        self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")])
+        self.mount_a.umount_wait()
+
+        # See all the strays go into purge queue
+        self._wait_for_counter("mds_cache", "strays_created", file_count, mds_id=rank_1_id)
+        self._wait_for_counter("mds_cache", "strays_enqueued", file_count, mds_id=rank_1_id)
+        self.assertEqual(self.get_stat("mds_cache", "num_strays", mds_id=rank_1_id), 0)
+
+        # See nothing get purged from the purge queue (yet)
+        time.sleep(10)
+        self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0)
+
+        # Shut down rank 1
+        self.fs.set_max_mds(1)
+
+        # It shouldn't proceed past stopping because its still not allowed
+        # to purge
+        time.sleep(10)
+        self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0)
+        self.assertFalse(self._is_stopped(1))
+
+        # Permit the daemon to start purging again
+        self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(rank_1_id),
+                                            'injectargs',
+                                            "--mds_max_purge_files 100")
+
+        # It should now proceed through shutdown
+        self.fs.wait_for_daemons(timeout=120)
+
+        # ...and in the process purge all that data
+        self.await_data_pool_empty()
+
+    def test_migration_on_shutdown(self):
+        """
+        That when an MDS rank is shut down, any non-purgeable strays
+        get migrated to another rank.
+        """
+
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        # Create a non-purgeable stray in a ~mds1 stray directory
+        # by doing a hard link and deleting the original file
+        self.mount_a.run_shell_payload("""
+mkdir dir_1 dir_2
+touch dir_1/original
+ln dir_1/original dir_2/linkto
+""")
+
+        self._force_migrate("dir_1")
+        self._force_migrate("dir_2", rank=0)
+
+        # empty mds cache. otherwise mds reintegrates stray when unlink finishes
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(['flush', 'journal'], rank_1_id)
+        self.fs.mds_asok(['cache', 'drop'], rank_1_id)
+
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell(["rm", "-f", "dir_1/original"])
+        self.mount_a.umount_wait()
+
+        self._wait_for_counter("mds_cache", "strays_created", 1,
+                               mds_id=rank_1_id)
+
+        # Shut down rank 1
+        self.fs.set_max_mds(1)
+        self.fs.wait_for_daemons(timeout=120)
+
+        # See that the stray counter on rank 0 has incremented
+        self.assertEqual(self.get_mdc_stat("strays_created", rank_0_id), 1)
+
+    def test_migrate_unlinked_dir(self):
+        """
+        Reproduce https://tracker.ceph.com/issues/53597
+        """
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        self.mount_a.run_shell_payload("""
+mkdir pin
+touch pin/placeholder
+""")
+
+        self._force_migrate("pin")
+
+        # Hold the dir open so it cannot be purged
+        p = self.mount_a.open_dir_background("pin/to-be-unlinked")
+
+        # Unlink the dentry
+        self.mount_a.run_shell(["rmdir", "pin/to-be-unlinked"])
+
+        # Wait to see the stray count increment
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays", mds_id=rank_1_id),
+            expect_val=1, timeout=60, reject_fn=lambda x: x > 1)
+        # but not purged
+        self.assertEqual(self.get_mdc_stat("strays_created", mds_id=rank_1_id), 1)
+        self.assertEqual(self.get_mdc_stat("strays_enqueued", mds_id=rank_1_id), 0)
+
+        # Test loading unlinked dir into cache
+        self.fs.mds_asok(['flush', 'journal'], rank_1_id)
+        self.fs.mds_asok(['cache', 'drop'], rank_1_id)
+
+        # Shut down rank 1
+        self.fs.set_max_mds(1)
+        self.fs.wait_for_daemons(timeout=120)
+        # Now the stray should be migrated to rank 0
+        # self.assertEqual(self.get_mdc_stat("strays_created", mds_id=rank_0_id), 1)
+        # https://github.com/ceph/ceph/pull/44335#issuecomment-1125940158
+
+        self.mount_a.kill_background(p)
+
+    def assert_backtrace(self, ino, expected_path):
+        """
+        Assert that the backtrace in the data pool for an inode matches
+        an expected /foo/bar path.
+        """
+        expected_elements = expected_path.strip("/").split("/")
+        bt = self.fs.read_backtrace(ino)
+        actual_elements = list(reversed([dn['dname'] for dn in bt['ancestors']]))
+        self.assertListEqual(expected_elements, actual_elements)
+
+    def get_backtrace_path(self, ino):
+        bt = self.fs.read_backtrace(ino)
+        elements = reversed([dn['dname'] for dn in bt['ancestors']])
+        return "/".join(elements)
+
+    def assert_purge_idle(self):
+        """
+        Assert that the MDS perf counters indicate no strays exist and
+        no ongoing purge activity.  Sanity check for when PurgeQueue should
+        be idle.
+        """
+        mdc_stats = self.fs.mds_asok(['perf', 'dump', "mds_cache"])['mds_cache']
+        pq_stats = self.fs.mds_asok(['perf', 'dump', "purge_queue"])['purge_queue']
+        self.assertEqual(mdc_stats["num_strays"], 0)
+        self.assertEqual(mdc_stats["num_strays_delayed"], 0)
+        self.assertEqual(pq_stats["pq_executing"], 0)
+        self.assertEqual(pq_stats["pq_executing_ops"], 0)
+
+    def test_mv_cleanup(self):
+        """
+        That when doing a rename from A to B, and B has no hardlinks,
+        then we make a stray for B and purge him.
+        """
+        # Create file_a and file_b, write some to both
+        size_mb = 8
+        self.mount_a.write_n_mb("file_a", size_mb)
+        file_a_ino = self.mount_a.path_to_ino("file_a")
+        self.mount_a.write_n_mb("file_b", size_mb)
+        file_b_ino = self.mount_a.path_to_ino("file_b")
+
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assert_backtrace(file_a_ino, "file_a")
+        self.assert_backtrace(file_b_ino, "file_b")
+
+        # mv file_a file_b
+        self.mount_a.run_shell(['mv', 'file_a', 'file_b'])
+
+        # See that stray counter increments
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        # Wait for purge counter to increment
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        self.assert_purge_idle()
+
+        # file_b should have been purged
+        self.assertTrue(self.fs.data_objects_absent(file_b_ino, size_mb * 1024 * 1024))
+
+        # Backtrace should have updated from file_a to file_b
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assert_backtrace(file_a_ino, "file_b")
+
+        # file_a's data should still exist
+        self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
+
+    def _pool_df(self, pool_name):
+        """
+        Return a dict like
+            {
+                "kb_used": 0,
+                "bytes_used": 0,
+                "max_avail": 19630292406,
+                "objects": 0
+            }
+
+        :param pool_name: Which pool (must exist)
+        """
+        out = self.fs.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")
+        for p in json.loads(out)['pools']:
+            if p['name'] == pool_name:
+                return p['stats']
+
+        raise RuntimeError("Pool '{0}' not found".format(pool_name))
+
+    def await_data_pool_empty(self):
+        self.wait_until_true(
+            lambda: self._pool_df(
+                self.fs.get_data_pool_name()
+            )['objects'] == 0,
+            timeout=60)
+
+    def test_snapshot_remove(self):
+        """
+        That removal of a snapshot that references a now-unlinked file results
+        in purging on the stray for the file.
+        """
+        # Enable snapshots
+        self.fs.set_allow_new_snaps(True)
+
+        # Create a dir with a file in it
+        size_mb = 8
+        self.mount_a.run_shell(["mkdir", "snapdir"])
+        self.mount_a.run_shell(["mkdir", "snapdir/subdir"])
+        self.mount_a.write_test_pattern("snapdir/subdir/file_a", size_mb * 1024 * 1024)
+        file_a_ino = self.mount_a.path_to_ino("snapdir/subdir/file_a")
+
+        # Snapshot the dir
+        self.mount_a.run_shell(["mkdir", "snapdir/.snap/snap1"])
+
+        # Cause the head revision to deviate from the snapshot
+        self.mount_a.write_n_mb("snapdir/subdir/file_a", size_mb)
+
+        # Flush the journal so that backtraces, dirfrag objects will actually be written
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Unlink the file
+        self.mount_a.run_shell(["rm", "-f", "snapdir/subdir/file_a"])
+        self.mount_a.run_shell(["rmdir", "snapdir/subdir"])
+
+        # Unmount the client because when I come back to check the data is still
+        # in the file I don't want to just see what's in the page cache.
+        self.mount_a.umount_wait()
+
+        self.assertEqual(self.get_mdc_stat("strays_created"), 2)
+
+        # FIXME: at this stage we see a purge and the stray count drops to
+        # zero, but there's actually still a stray, so at the very
+        # least the StrayManager stats code is slightly off
+
+        self.mount_a.mount_wait()
+
+        # See that the data from the snapshotted revision of the file is still present
+        # and correct
+        self.mount_a.validate_test_pattern("snapdir/.snap/snap1/subdir/file_a", size_mb * 1024 * 1024)
+
+        # Remove the snapshot
+        self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"])
+
+        # Purging file_a doesn't happen until after we've flushed the journal, because
+        # it is referenced by the snapshotted subdir, and the snapshot isn't really
+        # gone until the journal references to it are gone
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Wait for purging to complete, which requires the OSDMap to propagate to the OSDs.
+        # See also: http://tracker.ceph.com/issues/20072
+        self.wait_until_true(
+            lambda: self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024),
+            timeout=60
+        )
+
+        # See that a purge happens now
+        self._wait_for_counter("mds_cache", "strays_enqueued", 2)
+        self._wait_for_counter("purge_queue", "pq_executed", 2)
+
+        self.await_data_pool_empty()
+
+    def test_fancy_layout(self):
+        """
+        purge stray file with fancy layout
+        """
+
+        file_name = "fancy_layout_file"
+        self.mount_a.run_shell(["touch", file_name])
+
+        file_layout = "stripe_unit=1048576 stripe_count=4 object_size=8388608"
+        self.mount_a.setfattr(file_name, "ceph.file.layout", file_layout)
+
+        # 35MB requires 7 objects
+        size_mb = 35
+        self.mount_a.write_n_mb(file_name, size_mb)
+
+        self.mount_a.run_shell(["rm", "-f", file_name])
+        self.fs.mds_asok(["flush", "journal"])
+
+        # can't use self.fs.data_objects_absent here, it does not support fancy layout
+        self.await_data_pool_empty()
+
+    def test_dirfrag_limit(self):
+        """
+        That the directory fragment size cannot exceed mds_bal_fragment_size_max (using a limit of 50 in all configurations).
+        """
+
+        LOW_LIMIT = 50
+        self.config_set('mds', 'mds_bal_fragment_size_max', str(LOW_LIMIT))
+        time.sleep(10) # for config to reach MDS; async create is fast!!
+
+        try:
+            self.mount_a.create_n_files("subdir/file", LOW_LIMIT+1, finaldirsync=True)
+        except CommandFailedError:
+            pass # ENOSPC
+        else:
+            self.fail("fragment size exceeded")
+
+
+    def test_dirfrag_limit_fragmented(self):
+        """
+        That fragmentation (forced) will allow more entries to be created.
+        """
+
+        LOW_LIMIT = 50
+        self.config_set('mds', 'mds_bal_fragment_size_max', str(LOW_LIMIT))
+        self.config_set('mds', 'mds_bal_merge_size', 1) # disable merging
+        time.sleep(10) # for config to reach MDS; async create is fast!!
+
+        # Test that we can go beyond the limit if we fragment the directory
+        self.mount_a.create_n_files("subdir/file", LOW_LIMIT, finaldirsync=True)
+        self.mount_a.umount_wait() # release client caps
+
+        # Ensure that subdir is fragmented
+        self.fs.rank_asok(["dirfrag", "split", "/subdir", "0/0", "1"])
+        self.fs.rank_asok(["flush", "journal"])
+
+        # Create 50% more files than the current fragment limit
+        self.mount_a.mount_wait()
+        self.mount_a.create_n_files("subdir/file", (LOW_LIMIT*3)//2, finaldirsync=True)
+
+    def test_dirfrag_limit_strays(self):
+        """
+        That unlinking fails when the stray directory fragment becomes too
+        large and that unlinking may continue once those strays are purged.
+        """
+
+        LOW_LIMIT = 10
+        # N.B. this test is inherently racy because stray removal may be faster
+        # than slow(er) file creation.
+        self.config_set('mds', 'mds_bal_fragment_size_max', LOW_LIMIT)
+        time.sleep(10) # for config to reach MDS; async create is fast!!
+
+        # Now test the stray directory size is limited and recovers
+        strays_before = self.get_mdc_stat("strays_created")
+        try:
+            # 10 stray directories: expect collisions
+            self.mount_a.create_n_files("subdir/file", LOW_LIMIT*10, finaldirsync=True, unlink=True)
+        except CommandFailedError:
+            pass # ENOSPC
+        else:
+            self.fail("fragment size exceeded")
+        strays_after = self.get_mdc_stat("strays_created")
+        self.assertGreaterEqual(strays_after-strays_before, LOW_LIMIT)
+
+        self._wait_for_counter("mds_cache", "strays_enqueued", strays_after)
+        self._wait_for_counter("purge_queue", "pq_executed", strays_after)
+
+        # verify new files can be created and unlinked
+        self.mount_a.create_n_files("subdir/file", LOW_LIMIT, dirsync=True, unlink=True)
+
+    def test_purge_queue_upgrade(self):
+        """
+        That when starting on a system with no purge queue in the metadata
+        pool, we silently create one.
+        :return:
+        """
+
+        self.mds_cluster.mds_stop()
+        self.mds_cluster.mds_fail()
+        self.fs.radosm(["rm", "500.00000000"])
+        self.mds_cluster.mds_restart()
+        self.fs.wait_for_daemons()
+
+    def test_replicated_delete_speed(self):
+        """
+        That deletions of replicated metadata are not pathologically slow
+        """
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
+        self.mds_cluster.mds_fail_restart(rank_1_id)
+        self.fs.wait_for_daemons()
+
+        file_count = 10
+
+        self.mount_a.create_n_files("delete_me/file", file_count)
+
+        self._force_migrate("delete_me")
+
+        begin = datetime.datetime.now()
+        self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")])
+        end = datetime.datetime.now()
+
+        # What we're really checking here is that we are completing client
+        # operations immediately rather than delaying until the next tick.
+        tick_period = float(self.fs.get_config("mds_tick_interval",
+                                               service_type="mds"))
+
+        duration = (end - begin).total_seconds()
+        self.assertLess(duration, (file_count * tick_period) * 0.25)
diff --git a/qa/tasks/cephfs/test_volume_client.py b/qa/tasks/cephfs/test_volume_client.py
new file mode 100644
index 000000000..d1b2e760c
--- /dev/null
+++ b/qa/tasks/cephfs/test_volume_client.py
@@ -0,0 +1,1735 @@
+from io import StringIO
+import json
+import logging
+import os
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+
+class TestVolumeClient(CephFSTestCase):
+    # One for looking at the global filesystem, one for being
+    # the VolumeClient, two for mounting the created shares
+    CLIENTS_REQUIRED = 4
+
+    def setUp(self):
+        CephFSTestCase.setUp(self)
+
+    def _volume_client_python(self, client, script, vol_prefix=None, ns_prefix=None):
+        # Can't dedent this *and* the script we pass in, because they might have different
+        # levels of indentation to begin with, so leave this string zero-indented
+        if vol_prefix:
+            vol_prefix = "\"" + vol_prefix + "\""
+        if ns_prefix:
+            ns_prefix = "\"" + ns_prefix + "\""
+        return client.run_python("""
+from __future__ import print_function
+from ceph_volume_client import CephFSVolumeClient, VolumePath
+from sys import version_info as sys_version_info
+from rados import OSError as rados_OSError
+import logging
+log = logging.getLogger("ceph_volume_client")
+log.addHandler(logging.StreamHandler())
+log.setLevel(logging.DEBUG)
+vc = CephFSVolumeClient("manila", "{conf_path}", "ceph", {vol_prefix}, {ns_prefix})
+vc.connect()
+{payload}
+vc.disconnect()
+        """.format(payload=script, conf_path=client.config_path,
+                   vol_prefix=vol_prefix, ns_prefix=ns_prefix))
+
+    def _configure_vc_auth(self, mount, id_name):
+        """
+        Set up auth credentials for the VolumeClient user
+        """
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.{name}".format(name=id_name),
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+        mount.client_id = id_name
+        mount.client_remote.write_file(mount.get_keyring_path(),
+                                       out, sudo=True)
+        self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
+
+    def _configure_guest_auth(self, volumeclient_mount, guest_mount,
+                              guest_entity, cephfs_mntpt,
+                              namespace_prefix=None, readonly=False,
+                              tenant_id=None, allow_existing_id=False):
+        """
+        Set up auth credentials for the guest client to mount a volume.
+
+        :param volumeclient_mount: mount used as the handle for driving
+                                   volumeclient.
+        :param guest_mount: mount used by the guest client.
+        :param guest_entity: auth ID used by the guest client.
+        :param cephfs_mntpt: path of the volume.
+        :param namespace_prefix: name prefix of the RADOS namespace, which
+                                 is used for the volume's layout.
+        :param readonly: defaults to False. If set to 'True' only read-only
+                         mount access is granted to the guest.
+        :param tenant_id: (OpenStack) tenant ID of the guest client.
+        """
+
+        head, volume_id = os.path.split(cephfs_mntpt)
+        head, group_id = os.path.split(head)
+        head, volume_prefix = os.path.split(head)
+        volume_prefix = "/" + volume_prefix
+
+        # Authorize the guest client's auth ID to mount the volume.
+        key = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_result = vc.authorize(vp, "{guest_entity}", readonly={readonly},
+                                       tenant_id="{tenant_id}",
+                                       allow_existing_id="{allow_existing_id}")
+            print(auth_result['auth_key'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity,
+            readonly=readonly,
+            tenant_id=tenant_id,
+            allow_existing_id=allow_existing_id)), volume_prefix, namespace_prefix
+        )
+
+        # CephFSVolumeClient's authorize() does not return the secret
+        # key to a caller who isn't multi-tenant aware. Explicitly
+        # query the key for such a client.
+        if not tenant_id:
+            key = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-key", "client.{name}".format(name=guest_entity),
+            )
+
+        # The guest auth ID should exist.
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertIn("client.{0}".format(guest_entity), existing_ids)
+
+        # Create keyring file for the guest client.
+        keyring_txt = dedent("""
+        [client.{guest_entity}]
+            key = {key}
+
+        """.format(
+            guest_entity=guest_entity,
+            key=key
+        ))
+        guest_mount.client_id = guest_entity
+        guest_mount.client_remote.write_file(guest_mount.get_keyring_path(),
+                                             keyring_txt, sudo=True)
+
+        # Add a guest client section to the ceph config file.
+        self.set_conf("client.{0}".format(guest_entity), "client quota", "True")
+        self.set_conf("client.{0}".format(guest_entity), "debug client", "20")
+        self.set_conf("client.{0}".format(guest_entity), "debug objecter", "20")
+        self.set_conf("client.{0}".format(guest_entity),
+                      "keyring", guest_mount.get_keyring_path())
+
+    def test_default_prefix(self):
+        group_id = "grpid"
+        volume_id = "volid"
+        DEFAULT_VOL_PREFIX = "volumes"
+        DEFAULT_NS_PREFIX = "fsvolumens_"
+
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        #create a volume with default prefix
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 10, data_isolated=True)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # The dir should be created
+        self.mount_a.stat(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id))
+
+        #namespace should be set
+        ns_in_attr = self.mount_a.getfattr(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id), "ceph.dir.layout.pool_namespace")
+        namespace = "{0}{1}".format(DEFAULT_NS_PREFIX, volume_id)
+        self.assertEqual(namespace, ns_in_attr)
+
+
+    def test_lifecycle(self):
+        """
+        General smoke test for create, extend, destroy
+        """
+
+        # I'm going to use mount_c later as a guest for mounting the created
+        # shares
+        self.mounts[2].umount_wait()
+
+        # I'm going to leave mount_b unmounted and just use it as a handle for
+        # driving volumeclient.  It's a little hacky but we don't have a more
+        # general concept for librados/libcephfs clients as opposed to full
+        # blown mounting clients.
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+
+        volume_prefix = "/myprefix"
+        namespace_prefix = "mynsprefix_"
+
+        # Create a 100MB volume
+        volume_size = 100
+        cephfs_mntpt = self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*{volume_size})
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            volume_size=volume_size
+        )), volume_prefix, namespace_prefix)
+
+        # The dir should be created
+        self.mount_a.stat(os.path.join("myprefix", group_id, volume_id))
+
+        # Authorize and configure credentials for the guest to mount the
+        # the volume.
+        self._configure_guest_auth(self.mount_b, self.mounts[2], guest_entity,
+                                   cephfs_mntpt, namespace_prefix)
+        self.mounts[2].mount_wait(cephfs_mntpt=cephfs_mntpt)
+
+        # The kernel client doesn't have the quota-based df behaviour,
+        # or quotas at all, so only exercise the client behaviour when
+        # running fuse.
+        if isinstance(self.mounts[2], FuseMount):
+            # df should see volume size, same as the quota set on volume's dir
+            self.assertEqual(self.mounts[2].df()['total'],
+                             volume_size * 1024 * 1024)
+            self.assertEqual(
+                    self.mount_a.getfattr(
+                        os.path.join(volume_prefix.strip("/"), group_id, volume_id),
+                        "ceph.quota.max_bytes"),
+                    "%s" % (volume_size * 1024 * 1024))
+
+            # df granularity is 4MB block so have to write at least that much
+            data_bin_mb = 4
+            self.mounts[2].write_n_mb("data.bin", data_bin_mb)
+
+            # Write something outside volume to check this space usage is
+            # not reported in the volume's DF.
+            other_bin_mb = 8
+            self.mount_a.write_n_mb("other.bin", other_bin_mb)
+
+            # global: df should see all the writes (data + other).  This is a >
+            # rather than a == because the global spaced used includes all pools
+            def check_df():
+                used = self.mount_a.df()['used']
+                return used >= (other_bin_mb * 1024 * 1024)
+
+            self.wait_until_true(check_df, timeout=30)
+
+            # Hack: do a metadata IO to kick rstats
+            self.mounts[2].run_shell(["touch", "foo"])
+
+            # volume: df should see the data_bin_mb consumed from quota, same
+            # as the rbytes for the volume's dir
+            self.wait_until_equal(
+                    lambda: self.mounts[2].df()['used'],
+                    data_bin_mb * 1024 * 1024, timeout=60)
+            self.wait_until_equal(
+                    lambda: self.mount_a.getfattr(
+                        os.path.join(volume_prefix.strip("/"), group_id, volume_id),
+                        "ceph.dir.rbytes"),
+                    "%s" % (data_bin_mb * 1024 * 1024), timeout=60)
+
+            # sync so that file data are persist to rados
+            self.mounts[2].run_shell(["sync"])
+
+            # Our data should stay in particular rados namespace
+            pool_name = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool")
+            namespace = "{0}{1}".format(namespace_prefix, volume_id)
+            ns_in_attr = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool_namespace")
+            self.assertEqual(namespace, ns_in_attr)
+
+            objects_in_ns = set(self.fs.rados(["ls"], pool=pool_name, namespace=namespace, stdout=StringIO()).stdout.getvalue().split("\n"))
+            self.assertNotEqual(objects_in_ns, set())
+
+            # De-authorize the guest
+            self._volume_client_python(self.mount_b, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.deauthorize(vp, "{guest_entity}")
+                vc.evict("{guest_entity}")
+            """.format(
+                group_id=group_id,
+                volume_id=volume_id,
+                guest_entity=guest_entity
+            )), volume_prefix, namespace_prefix)
+
+            # Once deauthorized, the client should be unable to do any more metadata ops
+            # The way that the client currently behaves here is to block (it acts like
+            # it has lost network, because there is nothing to tell it that is messages
+            # are being dropped because it's identity is gone)
+            background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False)
+            try:
+                background.wait()
+            except CommandFailedError:
+                # command failed with EBLOCKLISTED?
+                if "transport endpoint shutdown" in background.stderr.getvalue():
+                    pass
+                else:
+                    raise
+
+            # After deauthorisation, the client ID should be gone (this was the only
+            # volume it was authorised for)
+            self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()])
+
+            # Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined)
+            self.mounts[2].umount_wait()
+
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )), volume_prefix, namespace_prefix)
+
+    def test_idempotency(self):
+        """
+        That the volumeclient interface works when calling everything twice
+        """
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 10)
+            vc.create_volume(vp, 10)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.delete_volume(vp)
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+            vc.purge_volume(vp)
+
+            vc.create_volume(vp, 10, data_isolated=True)
+            vc.create_volume(vp, 10, data_isolated=True)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.delete_volume(vp, data_isolated=True)
+            vc.delete_volume(vp, data_isolated=True)
+            vc.purge_volume(vp, data_isolated=True)
+            vc.purge_volume(vp, data_isolated=True)
+
+            vc.create_volume(vp, 10, namespace_isolated=False)
+            vc.create_volume(vp, 10, namespace_isolated=False)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.delete_volume(vp)
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity
+        )))
+
+    def test_data_isolated(self):
+        """
+        That data isolated shares get their own pool
+        :return:
+        """
+
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        pools_a = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+
+        group_id = "grpid"
+        volume_id = "volid"
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, data_isolated=True)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        pools_b = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+
+        # Should have created one new pool
+        new_pools = set(p['pool_name'] for p in pools_b) - set([p['pool_name'] for p in pools_a])
+        self.assertEqual(len(new_pools), 1)
+
+    def test_15303(self):
+        """
+        Reproducer for #15303 "Client holds incorrect complete flag on dir
+        after losing caps" (http://tracker.ceph.com/issues/15303)
+        """
+        for m in self.mounts:
+            m.umount_wait()
+
+        # Create a dir on mount A
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell(["mkdir", "parent1"])
+        self.mount_a.run_shell(["mkdir", "parent2"])
+        self.mount_a.run_shell(["mkdir", "parent1/mydir"])
+
+        # Put some files in it from mount B
+        self.mount_b.mount_wait()
+        self.mount_b.run_shell(["touch", "parent1/mydir/afile"])
+        self.mount_b.umount_wait()
+
+        # List the dir's contents on mount A
+        self.assertListEqual(self.mount_a.ls("parent1/mydir"),
+                             ["afile"])
+
+    def test_evict_client(self):
+        """
+        That a volume client can be evicted based on its auth ID and the volume
+        path it has mounted.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Requires FUSE client to inject client metadata")
+
+        # mounts[1] would be used as handle for driving VolumeClient. mounts[2]
+        # and mounts[3] would be used as guests to mount the volumes/shares.
+
+        for i in range(1, 4):
+            self.mounts[i].umount_wait()
+
+        volumeclient_mount = self.mounts[1]
+        self._configure_vc_auth(volumeclient_mount, "manila")
+        guest_mounts = (self.mounts[2], self.mounts[3])
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        cephfs_mntpts = []
+        volume_ids = []
+
+        # Create two volumes. Authorize 'guest' auth ID to mount the two
+        # volumes. Mount the two volumes. Write data to the volumes.
+        for i in range(2):
+            # Create volume.
+            volume_ids.append("volid_{0}".format(str(i)))
+            cephfs_mntpts.append(
+                self._volume_client_python(volumeclient_mount, dedent("""
+                    vp = VolumePath("{group_id}", "{volume_id}")
+                    create_result = vc.create_volume(vp, 10 * 1024 * 1024)
+                    print(create_result['mount_path'])
+                """.format(
+                    group_id=group_id,
+                    volume_id=volume_ids[i]
+            ))))
+
+            # Authorize 'guest' auth ID to mount the volume.
+            self._configure_guest_auth(volumeclient_mount, guest_mounts[i],
+                                       guest_entity, cephfs_mntpts[i])
+
+            # Mount the volume.
+            guest_mounts[i].mountpoint_dir_name = 'mnt.{id}.{suffix}'.format(
+                id=guest_entity, suffix=str(i))
+            guest_mounts[i].mount_wait(cephfs_mntpt=cephfs_mntpts[i])
+            guest_mounts[i].write_n_mb("data.bin", 1)
+
+
+        # Evict client, guest_mounts[0], using auth ID 'guest' and has mounted
+        # one volume.
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}", volume_path=vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_ids[0],
+            guest_entity=guest_entity
+        )))
+
+        # Evicted guest client, guest_mounts[0], should not be able to do
+        # anymore metadata ops.  It should start failing all operations
+        # when it sees that its own address is in the blocklist.
+        try:
+            guest_mounts[0].write_n_mb("rogue.bin", 1)
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError("post-eviction write should have failed!")
+
+        # The blocklisted guest client should now be unmountable
+        guest_mounts[0].umount_wait()
+
+        # Guest client, guest_mounts[1], using the same auth ID 'guest', but
+        # has mounted the other volume, should be able to use its volume
+        # unaffected.
+        guest_mounts[1].write_n_mb("data.bin.1", 1)
+
+        # Cleanup.
+        for i in range(2):
+            self._volume_client_python(volumeclient_mount, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.deauthorize(vp, "{guest_entity}")
+                vc.delete_volume(vp)
+                vc.purge_volume(vp)
+            """.format(
+                group_id=group_id,
+                volume_id=volume_ids[i],
+                guest_entity=guest_entity
+            )))
+
+
+    def test_purge(self):
+        """
+        Reproducer for #15266, exception trying to purge volumes that
+        contain non-ascii filenames.
+
+        Additionally test any other purge corner cases here.
+        """
+        # I'm going to leave mount_b unmounted and just use it as a handle for
+        # driving volumeclient.  It's a little hacky but we don't have a more
+        # general concept for librados/libcephfs clients as opposed to full
+        # blown mounting clients.
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        group_id = "grpid"
+        # Use a unicode volume ID (like Manila), to reproduce #15266
+        volume_id = u"volid"
+
+        # Create
+        cephfs_mntpt = self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", u"{volume_id}")
+            create_result = vc.create_volume(vp, 10)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Strip leading "/"
+        cephfs_mntpt = cephfs_mntpt[1:]
+
+        # A file with non-ascii characters
+        self.mount_a.run_shell(["touch", os.path.join(cephfs_mntpt, u"b\u00F6b")])
+
+        # A file with no permissions to do anything
+        self.mount_a.run_shell(["touch", os.path.join(cephfs_mntpt, "noperms")])
+        self.mount_a.run_shell(["chmod", "0000", os.path.join(cephfs_mntpt, "noperms")])
+
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", u"{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Check it's really gone
+        self.assertEqual(self.mount_a.ls("volumes/_deleting"), [])
+        self.assertEqual(self.mount_a.ls("volumes/"), ["_deleting", group_id])
+
+    def test_readonly_authorization(self):
+        """
+        That guest clients can be restricted to read-only mounts of volumes.
+        """
+
+        volumeclient_mount = self.mounts[1]
+        guest_mount = self.mounts[2]
+        volumeclient_mount.umount_wait()
+        guest_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+
+        # Create a volume.
+        cephfs_mntpt = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Authorize and configure credentials for the guest to mount the
+        # the volume with read-write access.
+        self._configure_guest_auth(volumeclient_mount, guest_mount,
+                                   guest_entity, cephfs_mntpt, readonly=False)
+
+        # Mount the volume, and write to it.
+        guest_mount.mount_wait(cephfs_mntpt=cephfs_mntpt)
+        guest_mount.write_n_mb("data.bin", 1)
+
+        # Change the guest auth ID's authorization to read-only mount access.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity
+        )))
+        self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
+                                   cephfs_mntpt, readonly=True)
+
+        # The effect of the change in access level to read-only is not
+        # immediate. The guest sees the change only after a remount of
+        # the volume.
+        guest_mount.umount_wait()
+        guest_mount.mount_wait(cephfs_mntpt=cephfs_mntpt)
+
+        # Read existing content of the volume.
+        self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
+        # Cannot write into read-only volume.
+        try:
+            guest_mount.write_n_mb("rogue.bin", 1)
+        except CommandFailedError:
+            pass
+
+    def test_get_authorized_ids(self):
+        """
+        That for a volume, the authorized IDs and their access levels
+        can be obtained using CephFSVolumeClient's get_authorized_ids().
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "grpid"
+        volume_id = "volid"
+        guest_entity_1 = "guest1"
+        guest_entity_2 = "guest2"
+
+        log.info("print(group ID: {0})".format(group_id))
+
+        # Create a volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+            auths = vc.get_authorized_ids(vp)
+            print(auths)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        # Check the list of authorized IDs for the volume.
+        self.assertEqual('None', auths)
+
+        # Allow two auth IDs access to the volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{guest_entity_1}", readonly=False)
+            vc.authorize(vp, "{guest_entity_2}", readonly=True)
+            auths = vc.get_authorized_ids(vp)
+            print(auths)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity_1=guest_entity_1,
+            guest_entity_2=guest_entity_2,
+        )))
+        # Check the list of authorized IDs and their access levels.
+        expected_result = [('guest1', 'rw'), ('guest2', 'r')]
+        self.assertCountEqual(str(expected_result), auths)
+
+        # Disallow both the auth IDs' access to the volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity_1}")
+            vc.deauthorize(vp, "{guest_entity_2}")
+            auths = vc.get_authorized_ids(vp)
+            print(auths)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity_1=guest_entity_1,
+            guest_entity_2=guest_entity_2,
+        )))
+        # Check the list of authorized IDs for the volume.
+        self.assertEqual('None', auths)
+
+    def test_multitenant_volumes(self):
+        """
+        That volume access can be restricted to a tenant.
+
+        That metadata used to enforce tenant isolation of
+        volumes is stored as a two-way mapping between auth
+        IDs and volumes that they're authorized to access.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        # Guest clients belonging to different tenants, but using the same
+        # auth ID.
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+        guestclient_2 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant2",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest', is
+        # created on authorizing 'guest' access to the volume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 2,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "groupid/volumeid": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print(json.dumps(auth_metadata))
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+        )))
+        auth_metadata = json.loads(auth_metadata)
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Verify that the volume metadata file stores info about auth IDs
+        # and their access levels to the volume, versioning details, etc.
+        expected_vol_metadata = {
+            "version": 2,
+            "compat_version": 1,
+            "auths": {
+                "guest": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        vol_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            vp = VolumePath("{group_id}", "{volume_id}")
+            volume_metadata = vc._volume_metadata_get(vp)
+            print(json.dumps(volume_metadata))
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        vol_metadata = json.loads(vol_metadata)
+
+        self.assertGreaterEqual(vol_metadata["version"], expected_vol_metadata["version"])
+        del expected_vol_metadata["version"]
+        del vol_metadata["version"]
+        self.assertEqual(expected_vol_metadata, vol_metadata)
+
+        # Cannot authorize 'guestclient_2' to access the volume.
+        # It uses auth ID 'guest', which has already been used by a
+        # 'guestclient_1' belonging to an another tenant for accessing
+        # the volume.
+        with self.assertRaises(CommandFailedError):
+            self._volume_client_python(volumeclient_mount, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+            """.format(
+                group_id=group_id,
+                volume_id=volume_id,
+                auth_id=guestclient_2["auth_id"],
+                tenant_id=guestclient_2["tenant_id"]
+            )))
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's only access to a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+    def test_authorize_auth_id_not_created_by_ceph_volume_client(self):
+        """
+        If the auth_id already exists and is not created by
+        ceph_volume_client, it's not allowed to authorize
+        the auth-id by default.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        # Create auth_id
+        self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.guest1",
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Cannot authorize 'guestclient_1' to access the volume.
+        # It uses auth ID 'guest1', which already exists and not
+        # created by ceph_volume_client
+        with self.assertRaises(CommandFailedError):
+            self._volume_client_python(volumeclient_mount, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+            """.format(
+                group_id=group_id,
+                volume_id=volume_id,
+                auth_id=guestclient_1["auth_id"],
+                tenant_id=guestclient_1["tenant_id"]
+            )))
+
+        # Delete volume
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+    def test_authorize_allow_existing_id_option(self):
+        """
+        If the auth_id already exists and is not created by
+        ceph_volume_client, it's not allowed to authorize
+        the auth-id by default but is allowed with option
+        allow_existing_id.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        # Create auth_id
+        self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.guest1",
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Cannot authorize 'guestclient_1' to access the volume
+        # by default, which already exists and not created by
+        # ceph_volume_client but is allowed with option 'allow_existing_id'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}",
+                         allow_existing_id="{allow_existing_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"],
+            allow_existing_id=True
+        )))
+
+        # Delete volume
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+    def test_deauthorize_auth_id_after_out_of_band_update(self):
+        """
+        If the auth_id authorized by ceph_volume_client is updated
+        out of band, the auth_id should not be deleted after a
+        deauthorize. It should only remove caps associated it.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Authorize 'guestclient_1' to access the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Update caps for guestclient_1 out of band
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "caps", "client.guest1",
+            "mds", "allow rw path=/volumes/groupid, allow rw path=/volumes/groupid/volumeid",
+            "osd", "allow rw pool=cephfs_data namespace=fsvolumens_volumeid",
+            "mon", "allow r",
+            "mgr", "allow *"
+        )
+
+        # Deauthorize guestclient_1
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+
+        # Validate the caps of guestclient_1 after deauthorize. It should not have deleted
+        # guestclient_1. The mgr and mds caps should be present which was updated out of band.
+        out = json.loads(self.fs.mon_manager.raw_cluster_cmd("auth", "get", "client.guest1", "--format=json-pretty"))
+
+        self.assertEqual("client.guest1", out[0]["entity"])
+        self.assertEqual("allow rw path=/volumes/groupid", out[0]["caps"]["mds"])
+        self.assertEqual("allow *", out[0]["caps"]["mgr"])
+        self.assertNotIn("osd", out[0]["caps"])
+
+        # Delete volume
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+    def test_recover_metadata(self):
+        """
+        That volume client can recover from partial auth updates using
+        metadata files, which store auth info and its update status info.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        guestclient = {
+            "auth_id": "guest",
+            "tenant_id": "tenant",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Authorize 'guestclient' access to the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+            tenant_id=guestclient["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest' is created.
+        auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run recovery procedure.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            auth_metadata['dirty'] = True
+            vc._auth_metadata_set("{auth_id}", auth_metadata)
+            vc.recover()
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+        )))
+
+    def test_update_old_style_auth_metadata_to_new_during_recover(self):
+        """
+        From nautilus onwards 'volumes' created by ceph_volume_client were
+        renamed and used as CephFS subvolumes accessed via the ceph-mgr
+        interface. Hence it makes sense to store the subvolume data in
+        auth-metadata file with 'subvolumes' key instead of 'volumes' key.
+        This test validates the transparent update of 'volumes' key to
+        'subvolumes' key in auth metadata file during recover.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        guestclient = {
+            "auth_id": "guest",
+            "tenant_id": "tenant",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient' access to the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+            tenant_id=guestclient["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest' is created.
+        auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        self.mounts[0].run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 2,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant",
+            "subvolumes": {
+                "groupid/volumeid": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run recovery procedure. This should also update 'volumes' key
+        # to 'subvolumes'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            auth_metadata['dirty'] = True
+            vc._auth_metadata_set("{auth_id}", auth_metadata)
+            vc.recover()
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+        )))
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print(json.dumps(auth_metadata))
+        """.format(
+            auth_id=guestclient["auth_id"],
+        )))
+        auth_metadata = json.loads(auth_metadata)
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's access to volumes 'volumeid'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guestclient["auth_id"]
+        )))
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+    def test_update_old_style_auth_metadata_to_new_during_authorize(self):
+        """
+        From nautilus onwards 'volumes' created by ceph_volume_client were
+        renamed and used as CephFS subvolumes accessed via the ceph-mgr
+        interface. Hence it makes sense to store the subvolume data in
+        auth-metadata file with 'subvolumes' key instead of 'volumes' key.
+        This test validates the transparent update of 'volumes' key to
+        'subvolumes' key in auth metadata file during authorize.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id1 = "volumeid1"
+        volume_id2 = "volumeid2"
+
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume volumeid1.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 10*1024*1024)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+        )))
+
+        # Create a volume volumeid2.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 10*1024*1024)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id1)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+        vol_metadata_filename2 = "_{0}:{1}.meta".format(group_id, volume_id2)
+        self.assertIn(vol_metadata_filename2, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume 'volumeid1'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest', is
+        # created on authorizing 'guest' access to the volume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        self.mounts[0].run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume 'volumeid2'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 2,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "groupid/volumeid1": {
+                    "dirty": False,
+                    "access_level": "rw"
+                },
+                "groupid/volumeid2": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print(json.dumps(auth_metadata))
+        """.format(
+            auth_id=guestclient_1["auth_id"],
+        )))
+        auth_metadata = json.loads(auth_metadata)
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's access to volumes 'volumeid1' and 'volumeid2'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+        )))
+        self.assertNotIn(vol_metadata_filename2, self.mounts[0].ls("volumes"))
+
+    def test_update_old_style_auth_metadata_to_new_during_deauthorize(self):
+        """
+        From nautilus onwards 'volumes' created by ceph_volume_client were
+        renamed and used as CephFS subvolumes accessed via the ceph-mgr
+        interface. Hence it makes sense to store the subvolume data in
+        auth-metadata file with 'subvolumes' key instead of 'volumes' key.
+        This test validates the transparent update of 'volumes' key to
+        'subvolumes' key in auth metadata file during de-authorize.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id1 = "volumeid1"
+        volume_id2 = "volumeid2"
+
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create a volume volumeid1.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 10*1024*1024)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+        )))
+
+        # Create a volume volumeid2.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 10*1024*1024)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id1)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+        vol_metadata_filename2 = "_{0}:{1}.meta".format(group_id, volume_id2)
+        self.assertIn(vol_metadata_filename2, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume 'volumeid1'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume 'volumeid2'.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest', is
+        # created on authorizing 'guest' access to the volume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        self.mounts[0].run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)])
+
+        # Deauthorize 'guestclient_1' to access 'volumeid2'. This should update
+        # 'volumes' key to 'subvolumes'
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+            guest_entity=guestclient_1["auth_id"],
+        )))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 2,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "groupid/volumeid1": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            import json
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print(json.dumps(auth_metadata))
+        """.format(
+            auth_id=guestclient_1["auth_id"],
+        )))
+        auth_metadata = json.loads(auth_metadata)
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's access to volumes 'volumeid1' and 'volumeid2'
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on 'volumeid1' deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id1,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on 'volumeid2' deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id2,
+        )))
+        self.assertNotIn(vol_metadata_filename2, self.mounts[0].ls("volumes"))
+
+    def test_put_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test data'
+        obj_name = 'test_vc_obj_1'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self._volume_client_python(vc_mount, dedent("""
+            vc.put_object("{pool_name}", "{obj_name}", b"{obj_data}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+            obj_data = obj_data
+        )))
+
+        read_data = self.fs.rados(['get', obj_name, '-'], pool=pool_name, stdout=StringIO()).stdout.getvalue()
+        self.assertEqual(obj_data, read_data)
+
+    def test_get_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test_data'
+        obj_name = 'test_vc_ob_2'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin=StringIO(obj_data))
+
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.get_object("{pool_name}", "{obj_name}")
+            assert data_read == b"{obj_data}"
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+            obj_data = obj_data
+        )))
+
+    def test_put_object_versioned(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test_data'
+        obj_name = 'test_vc_obj'
+        pool_name = self.fs.get_data_pool_names()[0]
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin=StringIO(obj_data))
+
+        self._volume_client_python(vc_mount, dedent("""
+            data, version_before = vc.get_object_and_version("{pool_name}", "{obj_name}")
+
+            if sys_version_info.major < 3:
+                data = data + 'modification1'
+            elif sys_version_info.major > 3:
+                data = str.encode(data.decode() + 'modification1')
+
+            vc.put_object_versioned("{pool_name}", "{obj_name}", data, version_before)
+            data, version_after = vc.get_object_and_version("{pool_name}", "{obj_name}")
+            assert version_after == version_before + 1
+        """).format(pool_name=pool_name, obj_name=obj_name))
+
+    def test_version_check_for_put_object_versioned(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test_data'
+        obj_name = 'test_vc_ob_2'
+        pool_name = self.fs.get_data_pool_names()[0]
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin=StringIO(obj_data))
+
+        # Test if put_object_versioned() crosschecks the version of the
+        # given object. Being a negative test, an exception is expected.
+        expected_exception = 'rados_OSError'
+        output = self._volume_client_python(vc_mount, dedent("""
+            data, version = vc.get_object_and_version("{pool_name}", "{obj_name}")
+
+            if sys_version_info.major < 3:
+                data = data + 'm1'
+            elif sys_version_info.major > 3:
+                data = str.encode(data.decode('utf-8') + 'm1')
+
+            vc.put_object("{pool_name}", "{obj_name}", data)
+
+            if sys_version_info.major < 3:
+                data = data + 'm2'
+            elif sys_version_info.major > 3:
+                data = str.encode(data.decode('utf-8') + 'm2')
+
+            try:
+                vc.put_object_versioned("{pool_name}", "{obj_name}", data, version)
+            except {expected_exception}:
+                print('{expected_exception} raised')
+        """).format(pool_name=pool_name, obj_name=obj_name,
+                    expected_exception=expected_exception))
+        self.assertEqual(expected_exception + ' raised', output)
+
+
+    def test_delete_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test data'
+        obj_name = 'test_vc_obj_3'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin=StringIO(obj_data))
+
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.delete_object("{pool_name}", "{obj_name}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+        )))
+
+        with self.assertRaises(CommandFailedError):
+            self.fs.rados(['stat', obj_name], pool=pool_name)
+
+        # Check idempotency -- no error raised trying to delete non-existent
+        # object
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.delete_object("{pool_name}", "{obj_name}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+        )))
+
+    def test_21501(self):
+        """
+        Reproducer for #21501 "ceph_volume_client: sets invalid caps for
+        existing IDs with no caps" (http://tracker.ceph.com/issues/21501)
+        """
+
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+
+        # Configure vc_mount as the handle for driving volumeclient
+        self._configure_vc_auth(vc_mount, "manila")
+
+        # Create a volume
+        group_id = "grpid"
+        volume_id = "volid"
+        cephfs_mntpt = self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Create an auth ID with no caps
+        guest_id = '21501'
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'get-or-create', 'client.{0}'.format(guest_id))
+
+        guest_mount = self.mounts[2]
+        guest_mount.umount_wait()
+# Set auth caps for the auth ID using the volumeclient
+        self._configure_guest_auth(vc_mount, guest_mount, guest_id, cephfs_mntpt,
+                                   allow_existing_id=True)
+
+        # Mount the volume in the guest using the auth ID to assert that the
+        # auth caps are valid
+        guest_mount.mount_wait(cephfs_mntpt=cephfs_mntpt)
+
+    def test_volume_without_namespace_isolation(self):
+        """
+        That volume client can create volumes that do not have separate RADOS
+        namespace layouts.
+        """
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+
+        # Configure vc_mount as the handle for driving volumeclient
+        self._configure_vc_auth(vc_mount, "manila")
+
+        # Create a volume
+        volume_prefix = "/myprefix"
+        group_id = "grpid"
+        volume_id = "volid"
+        self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10, namespace_isolated=False)
+            print(create_result['mount_path'])
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )), volume_prefix)
+
+        # The CephFS volume should be created
+        self.mounts[0].stat(os.path.join("myprefix", group_id, volume_id))
+        vol_namespace = self.mounts[0].getfattr(
+            os.path.join("myprefix", group_id, volume_id),
+            "ceph.dir.layout.pool_namespace")
+        assert not vol_namespace
+
+        self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )), volume_prefix)
diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py
new file mode 100644
index 000000000..7f17fe9e4
--- /dev/null
+++ b/qa/tasks/cephfs/test_volumes.py
@@ -0,0 +1,7807 @@
+import os
+import json
+import time
+import errno
+import random
+import logging
+import collections
+import uuid
+import unittest
+from hashlib import md5
+from textwrap import dedent
+from io import StringIO
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+class TestVolumesHelper(CephFSTestCase):
+    """Helper class for testing FS volume, subvolume group and subvolume operations."""
+    TEST_VOLUME_PREFIX = "volume"
+    TEST_SUBVOLUME_PREFIX="subvolume"
+    TEST_GROUP_PREFIX="group"
+    TEST_SNAPSHOT_PREFIX="snapshot"
+    TEST_CLONE_PREFIX="clone"
+    TEST_FILE_NAME_PREFIX="subvolume_file"
+
+    # for filling subvolume with data
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 2
+
+    # io defaults
+    DEFAULT_FILE_SIZE = 1 # MB
+    DEFAULT_NUMBER_OF_FILES = 1024
+
+    def _fs_cmd(self, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", *args)
+
+    def _raw_cmd(self, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd(*args)
+
+    def __check_clone_state(self, state, clone, clone_group=None, timo=120):
+        check = 0
+        args = ["clone", "status", self.volname, clone]
+        if clone_group:
+            args.append(clone_group)
+        args = tuple(args)
+        while check < timo:
+            result = json.loads(self._fs_cmd(*args))
+            if result["status"]["state"] == state:
+                break
+            check += 1
+            time.sleep(1)
+        self.assertTrue(check < timo)
+
+    def _get_clone_status(self, clone, clone_group=None):
+        args = ["clone", "status", self.volname, clone]
+        if clone_group:
+            args.append(clone_group)
+        args = tuple(args)
+        result = json.loads(self._fs_cmd(*args))
+        return result
+
+    def _wait_for_clone_to_complete(self, clone, clone_group=None, timo=120):
+        self.__check_clone_state("complete", clone, clone_group, timo)
+
+    def _wait_for_clone_to_fail(self, clone, clone_group=None, timo=120):
+        self.__check_clone_state("failed", clone, clone_group, timo)
+
+    def _wait_for_clone_to_be_in_progress(self, clone, clone_group=None, timo=120):
+        self.__check_clone_state("in-progress", clone, clone_group, timo)
+
+    def _check_clone_canceled(self, clone, clone_group=None):
+        self.__check_clone_state("canceled", clone, clone_group, timo=1)
+
+    def _get_subvolume_snapshot_path(self, subvolume, snapshot, source_group, subvol_path, source_version):
+        if source_version == 2:
+            # v2
+            if subvol_path is not None:
+                (base_path, uuid_str) = os.path.split(subvol_path)
+            else:
+                (base_path, uuid_str) = os.path.split(self._get_subvolume_path(self.volname, subvolume, group_name=source_group))
+            return os.path.join(base_path, ".snap", snapshot, uuid_str)
+
+        # v1
+        base_path = self._get_subvolume_path(self.volname, subvolume, group_name=source_group)
+        return os.path.join(base_path, ".snap", snapshot)
+
+    def _verify_clone_attrs(self, source_path, clone_path):
+        path1 = source_path
+        path2 = clone_path
+
+        p = self.mount_a.run_shell(["find", path1])
+        paths = p.stdout.getvalue().strip().split()
+
+        # for each entry in source and clone (sink) verify certain inode attributes:
+        # inode type, mode, ownership, [am]time.
+        for source_path in paths:
+            sink_entry = source_path[len(path1)+1:]
+            sink_path = os.path.join(path2, sink_entry)
+
+            # mode+type
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%f', source_path]).stdout.getvalue().strip(), 16)
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%f', sink_path]).stdout.getvalue().strip(), 16)
+            self.assertEqual(sval, cval)
+
+            # ownership
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%u', source_path]).stdout.getvalue().strip())
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%u', sink_path]).stdout.getvalue().strip())
+            self.assertEqual(sval, cval)
+
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%g', source_path]).stdout.getvalue().strip())
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%g', sink_path]).stdout.getvalue().strip())
+            self.assertEqual(sval, cval)
+
+            # inode timestamps
+            # do not check access as kclient will generally not update this like ceph-fuse will.
+            sval = int(self.mount_a.run_shell(['stat', '-c' '%Y', source_path]).stdout.getvalue().strip())
+            cval = int(self.mount_a.run_shell(['stat', '-c' '%Y', sink_path]).stdout.getvalue().strip())
+            self.assertEqual(sval, cval)
+
+    def _verify_clone_root(self, source_path, clone_path, clone, clone_group, clone_pool):
+        # verifies following clone root attrs quota, data_pool and pool_namespace
+        # remaining attributes of clone root are validated in _verify_clone_attrs
+
+        clone_info = json.loads(self._get_subvolume_info(self.volname, clone, clone_group))
+
+        # verify quota is inherited from source snapshot
+        src_quota = self.mount_a.getfattr(source_path, "ceph.quota.max_bytes")
+        # FIXME: kclient fails to get this quota value: https://tracker.ceph.com/issues/48075
+        if isinstance(self.mount_a, FuseMount):
+            self.assertEqual(clone_info["bytes_quota"], "infinite" if src_quota is None else int(src_quota))
+
+        if clone_pool:
+            # verify pool is set as per request
+            self.assertEqual(clone_info["data_pool"], clone_pool)
+        else:
+            # verify pool and pool namespace are inherited from snapshot
+            self.assertEqual(clone_info["data_pool"],
+                             self.mount_a.getfattr(source_path, "ceph.dir.layout.pool"))
+            self.assertEqual(clone_info["pool_namespace"],
+                             self.mount_a.getfattr(source_path, "ceph.dir.layout.pool_namespace"))
+
+    def _verify_clone(self, subvolume, snapshot, clone,
+                      source_group=None, clone_group=None, clone_pool=None,
+                      subvol_path=None, source_version=2, timo=120):
+        # pass in subvol_path (subvolume path when snapshot was taken) when subvolume is removed
+        # but snapshots are retained for clone verification
+        path1 = self._get_subvolume_snapshot_path(subvolume, snapshot, source_group, subvol_path, source_version)
+        path2 = self._get_subvolume_path(self.volname, clone, group_name=clone_group)
+
+        check = 0
+        # TODO: currently snapshot rentries are not stable if snapshot source entries
+        #       are removed, https://tracker.ceph.com/issues/46747
+        while check < timo and subvol_path is None:
+            val1 = int(self.mount_a.getfattr(path1, "ceph.dir.rentries"))
+            val2 = int(self.mount_a.getfattr(path2, "ceph.dir.rentries"))
+            if val1 == val2:
+                break
+            check += 1
+            time.sleep(1)
+        self.assertTrue(check < timo)
+
+        self._verify_clone_root(path1, path2, clone, clone_group, clone_pool)
+        self._verify_clone_attrs(path1, path2)
+
+    def _generate_random_volume_name(self, count=1):
+        n = self.volume_start
+        volumes = [f"{TestVolumes.TEST_VOLUME_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.volume_start += count
+        return volumes[0] if count == 1 else volumes
+
+    def _generate_random_subvolume_name(self, count=1):
+        n = self.subvolume_start
+        subvolumes = [f"{TestVolumes.TEST_SUBVOLUME_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.subvolume_start += count
+        return subvolumes[0] if count == 1 else subvolumes
+
+    def _generate_random_group_name(self, count=1):
+        n = self.group_start
+        groups = [f"{TestVolumes.TEST_GROUP_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.group_start += count
+        return groups[0] if count == 1 else groups
+
+    def _generate_random_snapshot_name(self, count=1):
+        n = self.snapshot_start
+        snaps = [f"{TestVolumes.TEST_SNAPSHOT_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.snapshot_start += count
+        return snaps[0] if count == 1 else snaps
+
+    def _generate_random_clone_name(self, count=1):
+        n = self.clone_start
+        clones = [f"{TestVolumes.TEST_CLONE_PREFIX}_{i:016}" for i in range(n, n+count)]
+        self.clone_start += count
+        return clones[0] if count == 1 else clones
+
+    def _enable_multi_fs(self):
+        self._fs_cmd("flag", "set", "enable_multiple", "true", "--yes-i-really-mean-it")
+
+    def _create_or_reuse_test_volume(self):
+        result = json.loads(self._fs_cmd("volume", "ls"))
+        if len(result) == 0:
+            self.vol_created = True
+            self.volname = self._generate_random_volume_name()
+            self._fs_cmd("volume", "create", self.volname)
+        else:
+            self.volname = result[0]['name']
+
+    def  _get_volume_info(self, vol_name, human_readable=False):
+        if human_readable:
+            args = ["volume", "info", vol_name, human_readable]
+        else:
+            args = ["volume", "info", vol_name]
+        args = tuple(args)
+        vol_md = self._fs_cmd(*args)
+        return vol_md
+
+    def  _get_subvolume_group_path(self, vol_name, group_name):
+        args = ("subvolumegroup", "getpath", vol_name, group_name)
+        path = self._fs_cmd(*args)
+        # remove the leading '/', and trailing whitespaces
+        return path[1:].rstrip()
+
+    def  _get_subvolume_group_info(self, vol_name, group_name):
+        args = ["subvolumegroup", "info", vol_name, group_name]
+        args = tuple(args)
+        group_md = self._fs_cmd(*args)
+        return group_md
+
+    def  _get_subvolume_path(self, vol_name, subvol_name, group_name=None):
+        args = ["subvolume", "getpath", vol_name, subvol_name]
+        if group_name:
+            args.append(group_name)
+        args = tuple(args)
+        path = self._fs_cmd(*args)
+        # remove the leading '/', and trailing whitespaces
+        return path[1:].rstrip()
+
+    def  _get_subvolume_info(self, vol_name, subvol_name, group_name=None):
+        args = ["subvolume", "info", vol_name, subvol_name]
+        if group_name:
+            args.append(group_name)
+        args = tuple(args)
+        subvol_md = self._fs_cmd(*args)
+        return subvol_md
+
+    def _get_subvolume_snapshot_info(self, vol_name, subvol_name, snapname, group_name=None):
+        args = ["subvolume", "snapshot", "info", vol_name, subvol_name, snapname]
+        if group_name:
+            args.append(group_name)
+        args = tuple(args)
+        snap_md = self._fs_cmd(*args)
+        return snap_md
+
+    def _delete_test_volume(self):
+        self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+
+    def _do_subvolume_pool_and_namespace_update(self, subvolume, pool=None, pool_namespace=None, subvolume_group=None):
+        subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group)
+
+        if pool is not None:
+            self.mount_a.setfattr(subvolpath, 'ceph.dir.layout.pool', pool, sudo=True)
+
+        if pool_namespace is not None:
+            self.mount_a.setfattr(subvolpath, 'ceph.dir.layout.pool_namespace', pool_namespace, sudo=True)
+
+    def _do_subvolume_attr_update(self, subvolume, uid, gid, mode, subvolume_group=None):
+        subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group)
+
+        # mode
+        self.mount_a.run_shell(['chmod', mode, subvolpath], sudo=True)
+
+        # ownership
+        self.mount_a.run_shell(['chown', uid, subvolpath], sudo=True)
+        self.mount_a.run_shell(['chgrp', gid, subvolpath], sudo=True)
+
+    def _do_subvolume_io(self, subvolume, subvolume_group=None, create_dir=None,
+                         number_of_files=DEFAULT_NUMBER_OF_FILES, file_size=DEFAULT_FILE_SIZE):
+        # get subvolume path for IO
+        args = ["subvolume", "getpath", self.volname, subvolume]
+        if subvolume_group:
+            args.append(subvolume_group)
+        args = tuple(args)
+        subvolpath = self._fs_cmd(*args)
+        self.assertNotEqual(subvolpath, None)
+        subvolpath = subvolpath[1:].rstrip() # remove "/" prefix and any trailing newline
+
+        io_path = subvolpath
+        if create_dir:
+            io_path = os.path.join(subvolpath, create_dir)
+            self.mount_a.run_shell_payload(f"mkdir -p {io_path}")
+
+        log.debug("filling subvolume {0} with {1} files each {2}MB size under directory {3}".format(subvolume, number_of_files, file_size, io_path))
+        for i in range(number_of_files):
+            filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
+            self.mount_a.write_n_mb(os.path.join(io_path, filename), file_size)
+
+    def _do_subvolume_io_mixed(self, subvolume, subvolume_group=None):
+        subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group)
+
+        reg_file = "regfile.0"
+        dir_path = os.path.join(subvolpath, "dir.0")
+        sym_path1 = os.path.join(subvolpath, "sym.0")
+        # this symlink's ownership would be changed
+        sym_path2 = os.path.join(dir_path, "sym.0")
+
+        self.mount_a.run_shell(["mkdir", dir_path])
+        self.mount_a.run_shell(["ln", "-s", "./{}".format(reg_file), sym_path1])
+        self.mount_a.run_shell(["ln", "-s", "./{}".format(reg_file), sym_path2])
+        # flip ownership to nobody. assumption: nobody's id is 65534
+        self.mount_a.run_shell(["chown", "-h", "65534:65534", sym_path2], sudo=True, omit_sudo=False)
+
+    def _wait_for_trash_empty(self, timeout=60):
+        # XXX: construct the trash dir path (note that there is no mgr
+        # [sub]volume interface for this).
+        trashdir = os.path.join("./", "volumes", "_deleting")
+        self.mount_a.wait_for_dir_empty(trashdir, timeout=timeout)
+
+    def _wait_for_subvol_trash_empty(self, subvol, group="_nogroup", timeout=30):
+        trashdir = os.path.join("./", "volumes", group, subvol, ".trash")
+        try:
+            self.mount_a.wait_for_dir_empty(trashdir, timeout=timeout)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                pass
+            else:
+                raise
+
+    def _assert_meta_location_and_version(self, vol_name, subvol_name, subvol_group=None, version=2, legacy=False):
+        if legacy:
+            subvol_path = self._get_subvolume_path(vol_name, subvol_name, group_name=subvol_group)
+            m = md5()
+            m.update(("/"+subvol_path).encode('utf-8'))
+            meta_filename = "{0}.meta".format(m.digest().hex())
+            metapath = os.path.join(".", "volumes", "_legacy", meta_filename)
+        else:
+            group = subvol_group if subvol_group is not None else '_nogroup'
+            metapath = os.path.join(".", "volumes", group, subvol_name, ".meta")
+
+        out = self.mount_a.run_shell(['cat', metapath], sudo=True)
+        lines = out.stdout.getvalue().strip().split('\n')
+        sv_version = -1
+        for line in lines:
+            if line == "version = " + str(version):
+                sv_version = version
+                break
+        self.assertEqual(sv_version, version, "version expected was '{0}' but got '{1}' from meta file at '{2}'".format(
+                         version, sv_version, metapath))
+
+    def _create_v1_subvolume(self, subvol_name, subvol_group=None, has_snapshot=True, subvol_type='subvolume', state='complete'):
+        group = subvol_group if subvol_group is not None else '_nogroup'
+        basepath = os.path.join("volumes", group, subvol_name)
+        uuid_str = str(uuid.uuid4())
+        createpath = os.path.join(basepath, uuid_str)
+        self.mount_a.run_shell(['mkdir', '-p', createpath], sudo=True)
+
+        # create a v1 snapshot, to prevent auto upgrades
+        if has_snapshot:
+            snappath = os.path.join(createpath, ".snap", "fake")
+            self.mount_a.run_shell(['mkdir', '-p', snappath], sudo=True)
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True)
+
+        # create a v1 .meta file
+        meta_contents = "[GLOBAL]\nversion = 1\ntype = {0}\npath = {1}\nstate = {2}\n".format(subvol_type, "/" + createpath, state)
+        if state == 'pending':
+            # add a fake clone source
+            meta_contents = meta_contents + '[source]\nvolume = fake\nsubvolume = fake\nsnapshot = fake\n'
+        meta_filepath1 = os.path.join(self.mount_a.mountpoint, basepath, ".meta")
+        self.mount_a.client_remote.write_file(meta_filepath1, meta_contents, sudo=True)
+        return createpath
+
+    def _update_fake_trash(self, subvol_name, subvol_group=None, trash_name='fake', create=True):
+        group = subvol_group if subvol_group is not None else '_nogroup'
+        trashpath = os.path.join("volumes", group, subvol_name, '.trash', trash_name)
+        if create:
+            self.mount_a.run_shell(['mkdir', '-p', trashpath], sudo=True)
+        else:
+            self.mount_a.run_shell(['rmdir', trashpath], sudo=True)
+
+    def _configure_guest_auth(self, guest_mount, authid, key):
+        """
+        Set up auth credentials for a guest client.
+        """
+        # Create keyring file for the guest client.
+        keyring_txt = dedent("""
+        [client.{authid}]
+            key = {key}
+
+        """.format(authid=authid,key=key))
+
+        guest_mount.client_id = authid
+        guest_mount.client_remote.write_file(guest_mount.get_keyring_path(),
+                                             keyring_txt, sudo=True)
+        # Add a guest client section to the ceph config file.
+        self.config_set("client.{0}".format(authid), "debug client", 20)
+        self.config_set("client.{0}".format(authid), "debug objecter", 20)
+        self.set_conf("client.{0}".format(authid),
+                      "keyring", guest_mount.get_keyring_path())
+
+    def _auth_metadata_get(self, filedata):
+        """
+        Return a deserialized JSON object, or None
+        """
+        try:
+            data = json.loads(filedata)
+        except json.decoder.JSONDecodeError:
+            data = None
+        return data
+
+    def setUp(self):
+        super(TestVolumesHelper, self).setUp()
+        self.volname = None
+        self.vol_created = False
+        self._enable_multi_fs()
+        self._create_or_reuse_test_volume()
+        self.config_set('mon', 'mon_allow_pool_delete', True)
+        self.volume_start = random.randint(1, (1<<20))
+        self.subvolume_start = random.randint(1, (1<<20))
+        self.group_start = random.randint(1, (1<<20))
+        self.snapshot_start = random.randint(1, (1<<20))
+        self.clone_start = random.randint(1, (1<<20))
+
+    def tearDown(self):
+        if self.vol_created:
+            self._delete_test_volume()
+        super(TestVolumesHelper, self).tearDown()
+
+
+class TestVolumes(TestVolumesHelper):
+    """Tests for FS volume operations."""
+    def test_volume_create(self):
+        """
+        That the volume can be created and then cleans up
+        """
+        volname = self._generate_random_volume_name()
+        self._fs_cmd("volume", "create", volname)
+        volumels = json.loads(self._fs_cmd("volume", "ls"))
+
+        if not (volname in ([volume['name'] for volume in volumels])):
+            raise RuntimeError("Error creating volume '{0}'".format(volname))
+        else:
+            # clean up
+            self._fs_cmd("volume", "rm", volname, "--yes-i-really-mean-it")
+
+    def test_volume_ls(self):
+        """
+        That the existing and the newly created volumes can be listed and
+        finally cleans up.
+        """
+        vls = json.loads(self._fs_cmd("volume", "ls"))
+        volumes = [volume['name'] for volume in vls]
+
+        #create new volumes and add it to the existing list of volumes
+        volumenames = self._generate_random_volume_name(2)
+        for volumename in volumenames:
+            self._fs_cmd("volume", "create", volumename)
+        volumes.extend(volumenames)
+
+        # list volumes
+        try:
+            volumels = json.loads(self._fs_cmd('volume', 'ls'))
+            if len(volumels) == 0:
+                raise RuntimeError("Expected the 'fs volume ls' command to list the created volumes.")
+            else:
+                volnames = [volume['name'] for volume in volumels]
+                if collections.Counter(volnames) != collections.Counter(volumes):
+                    raise RuntimeError("Error creating or listing volumes")
+        finally:
+            # clean up
+            for volume in volumenames:
+                self._fs_cmd("volume", "rm", volume, "--yes-i-really-mean-it")
+
+    def test_volume_rm(self):
+        """
+        That the volume can only be removed when --yes-i-really-mean-it is used
+        and verify that the deleted volume is not listed anymore.
+        """
+        for m in self.mounts:
+            m.umount_wait()
+        try:
+            self._fs_cmd("volume", "rm", self.volname)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EPERM:
+                raise RuntimeError("expected the 'fs volume rm' command to fail with EPERM, "
+                                   "but it failed with {0}".format(ce.exitstatus))
+            else:
+                self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+
+                #check if it's gone
+                volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty"))
+                if (self.volname in [volume['name'] for volume in volumes]):
+                    raise RuntimeError("Expected the 'fs volume rm' command to succeed. "
+                                       "The volume {0} not removed.".format(self.volname))
+        else:
+            raise RuntimeError("expected the 'fs volume rm' command to fail.")
+
+    def test_volume_rm_arbitrary_pool_removal(self):
+        """
+        That the arbitrary pool added to the volume out of band is removed
+        successfully on volume removal.
+        """
+        for m in self.mounts:
+            m.umount_wait()
+        new_pool = "new_pool"
+        # add arbitrary data pool
+        self.fs.add_data_pool(new_pool)
+        vol_status = json.loads(self._fs_cmd("status", self.volname, "--format=json-pretty"))
+        self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+
+        #check if fs is gone
+        volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty"))
+        volnames = [volume['name'] for volume in volumes]
+        self.assertNotIn(self.volname, volnames)
+
+        #check if osd pools are gone
+        pools = json.loads(self._raw_cmd("osd", "pool", "ls", "--format=json-pretty"))
+        for pool in vol_status["pools"]:
+            self.assertNotIn(pool["name"], pools)
+
+    def test_volume_rm_when_mon_delete_pool_false(self):
+        """
+        That the volume can only be removed when mon_allowd_pool_delete is set
+        to true and verify that the pools are removed after volume deletion.
+        """
+        for m in self.mounts:
+            m.umount_wait()
+        self.config_set('mon', 'mon_allow_pool_delete', False)
+        try:
+            self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM,
+                             "expected the 'fs volume rm' command to fail with EPERM, "
+                             "but it failed with {0}".format(ce.exitstatus))
+        vol_status = json.loads(self._fs_cmd("status", self.volname, "--format=json-pretty"))
+        self.config_set('mon', 'mon_allow_pool_delete', True)
+        self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it")
+
+        #check if fs is gone
+        volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty"))
+        volnames = [volume['name'] for volume in volumes]
+        self.assertNotIn(self.volname, volnames,
+                         "volume {0} exists after removal".format(self.volname))
+        #check if pools are gone
+        pools = json.loads(self._raw_cmd("osd", "pool", "ls", "--format=json-pretty"))
+        for pool in vol_status["pools"]:
+            self.assertNotIn(pool["name"], pools,
+                             "pool {0} exists after volume removal".format(pool["name"]))
+
+    def test_volume_info(self):
+        """
+        Tests the 'fs volume info' command
+        """
+        vol_fields = ["pools", "used_size", "pending_subvolume_deletions", "mon_addrs"]
+        group = self._generate_random_group_name()
+        # create subvolumegroup
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        # get volume metadata
+        vol_info = json.loads(self._get_volume_info(self.volname))
+        for md in vol_fields:
+            self.assertIn(md, vol_info,
+                          f"'{md}' key not present in metadata of volume")
+        self.assertEqual(vol_info["used_size"], 0,
+                         "Size should be zero when volumes directory is empty")
+
+    def test_volume_info_without_subvolumegroup(self):
+        """
+        Tests the 'fs volume info' command without subvolume group
+        """
+        vol_fields = ["pools", "mon_addrs"]
+        # get volume metadata
+        vol_info = json.loads(self._get_volume_info(self.volname))
+        for md in vol_fields:
+            self.assertIn(md, vol_info,
+                          f"'{md}' key not present in metadata of volume")
+        self.assertNotIn("used_size", vol_info,
+                         "'used_size' should not be present in absence of subvolumegroup")
+        self.assertNotIn("pending_subvolume_deletions", vol_info,
+                         "'pending_subvolume_deletions' should not be present in absence"
+                         " of subvolumegroup")
+
+    def test_volume_info_with_human_readable_flag(self):
+        """
+        Tests the 'fs volume info --human_readable' command
+        """
+        vol_fields = ["pools", "used_size", "pending_subvolume_deletions", "mon_addrs"]
+        group = self._generate_random_group_name()
+        # create subvolumegroup
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        # get volume metadata
+        vol_info = json.loads(self._get_volume_info(self.volname, "--human_readable"))
+        for md in vol_fields:
+            self.assertIn(md, vol_info,
+                          f"'{md}' key not present in metadata of volume")
+        units = [' ', 'k', 'M', 'G', 'T', 'P', 'E']
+        assert vol_info["used_size"][-1] in units, "unit suffix in used_size is absent"
+        assert vol_info["pools"]["data"][0]["avail"][-1] in units, "unit suffix in avail data is absent"
+        assert vol_info["pools"]["data"][0]["used"][-1] in units, "unit suffix in used data is absent"
+        assert vol_info["pools"]["metadata"][0]["avail"][-1] in units, "unit suffix in avail metadata is absent"
+        assert vol_info["pools"]["metadata"][0]["used"][-1] in units, "unit suffix in used metadata is absent"
+        self.assertEqual(int(vol_info["used_size"]), 0,
+                         "Size should be zero when volumes directory is empty")
+
+    def test_volume_info_with_human_readable_flag_without_subvolumegroup(self):
+        """
+        Tests the 'fs volume info --human_readable' command without subvolume group
+        """
+        vol_fields = ["pools", "mon_addrs"]
+        # get volume metadata
+        vol_info = json.loads(self._get_volume_info(self.volname, "--human_readable"))
+        for md in vol_fields:
+            self.assertIn(md, vol_info,
+                          f"'{md}' key not present in metadata of volume")
+        units = [' ', 'k', 'M', 'G', 'T', 'P', 'E']
+        assert vol_info["pools"]["data"][0]["avail"][-1] in units, "unit suffix in avail data is absent"
+        assert vol_info["pools"]["data"][0]["used"][-1] in units, "unit suffix in used data is absent"
+        assert vol_info["pools"]["metadata"][0]["avail"][-1] in units, "unit suffix in avail metadata is absent"
+        assert vol_info["pools"]["metadata"][0]["used"][-1] in units, "unit suffix in used metadata is absent"
+        self.assertNotIn("used_size", vol_info,
+                         "'used_size' should not be present in absence of subvolumegroup")
+        self.assertNotIn("pending_subvolume_deletions", vol_info,
+                         "'pending_subvolume_deletions' should not be present in absence"
+                         " of subvolumegroup")
+
+
+class TestSubvolumeGroups(TestVolumesHelper):
+    """Tests for FS subvolume group operations."""
+    def test_default_uid_gid_subvolume_group(self):
+        group = self._generate_random_group_name()
+        expected_uid = 0
+        expected_gid = 0
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        group_path = self._get_subvolume_group_path(self.volname, group)
+
+        # check group's uid and gid
+        stat = self.mount_a.stat(group_path)
+        self.assertEqual(stat['st_uid'], expected_uid)
+        self.assertEqual(stat['st_gid'], expected_gid)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_nonexistent_subvolume_group_create(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = "non_existent_group"
+
+        # try, creating subvolume in a nonexistent group
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolume create' command to fail")
+
+    def test_nonexistent_subvolume_group_rm(self):
+        group = "non_existent_group"
+
+        # try, remove subvolume group
+        try:
+            self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolumegroup rm' command to fail")
+
+    def test_subvolume_group_create_with_auto_cleanup_on_fail(self):
+        group = self._generate_random_group_name()
+        data_pool = "invalid_pool"
+        # create group with invalid data pool layout
+        with self.assertRaises(CommandFailedError):
+            self._fs_cmd("subvolumegroup", "create", self.volname, group, "--pool_layout", data_pool)
+
+        # check whether group path is cleaned up
+        try:
+            self._fs_cmd("subvolumegroup", "getpath", self.volname, group)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolumegroup getpath' command to fail")
+
+    def test_subvolume_group_create_with_desired_data_pool_layout(self):
+        group1, group2 = self._generate_random_group_name(2)
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group1)
+        group1_path = self._get_subvolume_group_path(self.volname, group1)
+
+        default_pool = self.mount_a.getfattr(group1_path, "ceph.dir.layout.pool")
+        new_pool = "new_pool"
+        self.assertNotEqual(default_pool, new_pool)
+
+        # add data pool
+        newid = self.fs.add_data_pool(new_pool)
+
+        # create group specifying the new data pool as its pool layout
+        self._fs_cmd("subvolumegroup", "create", self.volname, group2,
+                     "--pool_layout", new_pool)
+        group2_path = self._get_subvolume_group_path(self.volname, group2)
+
+        desired_pool = self.mount_a.getfattr(group2_path, "ceph.dir.layout.pool")
+        try:
+            self.assertEqual(desired_pool, new_pool)
+        except AssertionError:
+            self.assertEqual(int(desired_pool), newid) # old kernel returns id
+
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group1)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group2)
+
+    def test_subvolume_group_create_with_desired_mode(self):
+        group1, group2 = self._generate_random_group_name(2)
+        # default mode
+        expected_mode1 = "755"
+        # desired mode
+        expected_mode2 = "777"
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group2, f"--mode={expected_mode2}")
+        self._fs_cmd("subvolumegroup", "create", self.volname, group1)
+
+        group1_path = self._get_subvolume_group_path(self.volname, group1)
+        group2_path = self._get_subvolume_group_path(self.volname, group2)
+        volumes_path = os.path.dirname(group1_path)
+
+        # check group's mode
+        actual_mode1 = self.mount_a.run_shell(['stat', '-c' '%a', group1_path]).stdout.getvalue().strip()
+        actual_mode2 = self.mount_a.run_shell(['stat', '-c' '%a', group2_path]).stdout.getvalue().strip()
+        actual_mode3 = self.mount_a.run_shell(['stat', '-c' '%a', volumes_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode1, expected_mode1)
+        self.assertEqual(actual_mode2, expected_mode2)
+        self.assertEqual(actual_mode3, expected_mode1)
+
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group1)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group2)
+
+    def test_subvolume_group_create_with_desired_uid_gid(self):
+        """
+        That the subvolume group can be created with the desired uid and gid and its uid and gid matches the
+        expected values.
+        """
+        uid = 1000
+        gid = 1000
+
+        # create subvolume group
+        subvolgroupname = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, subvolgroupname, "--uid", str(uid), "--gid", str(gid))
+
+        # make sure it exists
+        subvolgrouppath = self._get_subvolume_group_path(self.volname, subvolgroupname)
+        self.assertNotEqual(subvolgrouppath, None)
+
+        # verify the uid and gid
+        suid = int(self.mount_a.run_shell(['stat', '-c' '%u', subvolgrouppath]).stdout.getvalue().strip())
+        sgid = int(self.mount_a.run_shell(['stat', '-c' '%g', subvolgrouppath]).stdout.getvalue().strip())
+        self.assertEqual(uid, suid)
+        self.assertEqual(gid, sgid)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, subvolgroupname)
+
+    def test_subvolume_group_create_with_invalid_data_pool_layout(self):
+        group = self._generate_random_group_name()
+        data_pool = "invalid_pool"
+        # create group with invalid data pool layout
+        try:
+            self._fs_cmd("subvolumegroup", "create", self.volname, group, "--pool_layout", data_pool)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EINVAL:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolumegroup create' command to fail")
+
+    def test_subvolume_group_create_with_size(self):
+        # create group with size -- should set quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000")
+
+        # get group metadata
+        group_info = json.loads(self._get_subvolume_group_info(self.volname, group))
+        self.assertEqual(group_info["bytes_quota"], 1000000000)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_info(self):
+        # tests the 'fs subvolumegroup info' command
+
+        group_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime",
+                     "data_pool", "gid", "mode", "mon_addrs", "mtime", "uid"]
+
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # get group metadata
+        group_info = json.loads(self._get_subvolume_group_info(self.volname, group))
+        for md in group_md:
+            self.assertIn(md, group_info, "'{0}' key not present in metadata of group".format(md))
+
+        self.assertEqual(group_info["bytes_pcent"], "undefined", "bytes_pcent should be set to undefined if quota is not set")
+        self.assertEqual(group_info["bytes_quota"], "infinite", "bytes_quota should be set to infinite if quota is not set")
+        self.assertEqual(group_info["uid"], 0)
+        self.assertEqual(group_info["gid"], 0)
+
+        nsize = self.DEFAULT_FILE_SIZE*1024*1024
+        self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize))
+
+        # get group metadata after quota set
+        group_info = json.loads(self._get_subvolume_group_info(self.volname, group))
+        for md in group_md:
+            self.assertIn(md, group_info, "'{0}' key not present in metadata of subvolume".format(md))
+
+        self.assertNotEqual(group_info["bytes_pcent"], "undefined", "bytes_pcent should not be set to undefined if quota is set")
+        self.assertEqual(group_info["bytes_quota"], nsize, "bytes_quota should be set to '{0}'".format(nsize))
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_create_idempotence(self):
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # try creating w/ same subvolume group name -- should be idempotent
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_create_idempotence_mode(self):
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # try creating w/ same subvolume group name with mode -- should set mode
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "--mode=766")
+
+        group_path = self._get_subvolume_group_path(self.volname, group)
+
+        # check subvolumegroup's  mode
+        mode = self.mount_a.run_shell(['stat', '-c' '%a', group_path]).stdout.getvalue().strip()
+        self.assertEqual(mode, "766")
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_create_idempotence_uid_gid(self):
+        desired_uid = 1000
+        desired_gid = 1000
+
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # try creating w/ same subvolume group name with uid/gid -- should set uid/gid
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "--uid", str(desired_uid), "--gid", str(desired_gid))
+
+        group_path = self._get_subvolume_group_path(self.volname, group)
+
+        # verify the uid and gid
+        actual_uid = int(self.mount_a.run_shell(['stat', '-c' '%u', group_path]).stdout.getvalue().strip())
+        actual_gid = int(self.mount_a.run_shell(['stat', '-c' '%g', group_path]).stdout.getvalue().strip())
+        self.assertEqual(desired_uid, actual_uid)
+        self.assertEqual(desired_gid, actual_gid)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_create_idempotence_data_pool(self):
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        group_path = self._get_subvolume_group_path(self.volname, group)
+
+        default_pool = self.mount_a.getfattr(group_path, "ceph.dir.layout.pool")
+        new_pool = "new_pool"
+        self.assertNotEqual(default_pool, new_pool)
+
+        # add data pool
+        newid = self.fs.add_data_pool(new_pool)
+
+        # try creating w/ same subvolume group name with new data pool -- should set pool
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "--pool_layout", new_pool)
+        desired_pool = self.mount_a.getfattr(group_path, "ceph.dir.layout.pool")
+        try:
+            self.assertEqual(desired_pool, new_pool)
+        except AssertionError:
+            self.assertEqual(int(desired_pool), newid) # old kernel returns id
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_create_idempotence_resize(self):
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # try creating w/ same subvolume name with size -- should set quota
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000")
+
+        # get group metadata
+        group_info = json.loads(self._get_subvolume_group_info(self.volname, group))
+        self.assertEqual(group_info["bytes_quota"], 1000000000)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_quota_mds_path_restriction_to_group_path(self):
+        """
+        Tests subvolumegroup quota enforcement with mds path restriction set to group.
+        For quota to be enforced, read permission needs to be provided to the parent
+        of the directory on which quota is set. Please see the tracker comment [1]
+        [1] https://tracker.ceph.com/issues/55090#note-8
+        """
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*100
+        # create group with 100MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # create subvolume under the group
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname,
+                     "--group_name", group, "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group)
+        self.assertNotEqual(subvolpath, None)
+
+        # Create auth_id
+        authid = "client.guest1"
+        user = json.loads(self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", authid,
+            "mds", "allow rw path=/volumes",
+            "mgr", "allow rw",
+            "osd", "allow rw tag cephfs *=*",
+            "mon", "allow r",
+            "--format=json-pretty"
+            ))
+
+        # Prepare guest_mount with new authid
+        guest_mount = self.mount_b
+        guest_mount.umount_wait()
+
+        # configure credentials for guest client
+        self._configure_guest_auth(guest_mount, "guest1", user[0]["key"])
+
+        # mount the subvolume
+        mount_path = os.path.join("/", subvolpath)
+        guest_mount.mount_wait(cephfs_mntpt=mount_path)
+
+        # create 99 files of 1MB
+        guest_mount.run_shell_payload("mkdir -p dir1")
+        for i in range(99):
+            filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
+            guest_mount.write_n_mb(os.path.join("dir1", filename), self.DEFAULT_FILE_SIZE)
+        try:
+            # write two files of 1MB file to exceed the quota
+            guest_mount.run_shell_payload("mkdir -p dir2")
+            for i in range(2):
+                filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
+                guest_mount.write_n_mb(os.path.join("dir2", filename), self.DEFAULT_FILE_SIZE)
+            # For quota to be enforced
+            time.sleep(60)
+            # create 400 files of 1MB to exceed quota
+            for i in range(400):
+                filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
+                guest_mount.write_n_mb(os.path.join("dir2", filename), self.DEFAULT_FILE_SIZE)
+                # Sometimes quota enforcement takes time.
+                if i == 200:
+                    time.sleep(60)
+        except CommandFailedError:
+            pass
+        else:
+            self.fail(f"expected filling subvolume {subvolname} with 400 files of size 1MB to fail")
+
+        # clean up
+        guest_mount.umount_wait()
+
+        # Delete the subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_quota_mds_path_restriction_to_subvolume_path(self):
+        """
+        Tests subvolumegroup quota enforcement with mds path restriction set to subvolume path
+        The quota should not be enforced because of the fourth limitation mentioned at
+        https://docs.ceph.com/en/latest/cephfs/quota/#limitations
+        """
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*100
+        # create group with 100MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # create subvolume under the group
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname,
+                     "--group_name", group, "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group)
+        self.assertNotEqual(subvolpath, None)
+
+        mount_path = os.path.join("/", subvolpath)
+
+        # Create auth_id
+        authid = "client.guest1"
+        user = json.loads(self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", authid,
+            "mds", f"allow rw path={mount_path}",
+            "mgr", "allow rw",
+            "osd", "allow rw tag cephfs *=*",
+            "mon", "allow r",
+            "--format=json-pretty"
+            ))
+
+        # Prepare guest_mount with new authid
+        guest_mount = self.mount_b
+        guest_mount.umount_wait()
+
+        # configure credentials for guest client
+        self._configure_guest_auth(guest_mount, "guest1", user[0]["key"])
+
+        # mount the subvolume
+        guest_mount.mount_wait(cephfs_mntpt=mount_path)
+
+        # create 99 files of 1MB to exceed quota
+        guest_mount.run_shell_payload("mkdir -p dir1")
+        for i in range(99):
+            filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
+            guest_mount.write_n_mb(os.path.join("dir1", filename), self.DEFAULT_FILE_SIZE)
+        try:
+            # write two files of 1MB file to exceed the quota
+            guest_mount.run_shell_payload("mkdir -p dir2")
+            for i in range(2):
+                filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
+                guest_mount.write_n_mb(os.path.join("dir2", filename), self.DEFAULT_FILE_SIZE)
+            # For quota to be enforced
+            time.sleep(60)
+            # create 400 files of 1MB to exceed quota
+            for i in range(400):
+                filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
+                guest_mount.write_n_mb(os.path.join("dir2", filename), self.DEFAULT_FILE_SIZE)
+                # Sometimes quota enforcement takes time.
+                if i == 200:
+                    time.sleep(60)
+        except CommandFailedError:
+            self.fail(f"Quota should not be enforced, expected filling subvolume {subvolname} with 400 files of size 1MB to succeed")
+
+        # clean up
+        guest_mount.umount_wait()
+
+        # Delete the subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_quota_exceeded_subvolume_removal(self):
+        """
+        Tests subvolume removal if it's group quota is exceeded
+        """
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*100
+        # create group with 100MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # create subvolume under the group
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname,
+                     "--group_name", group, "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group)
+        self.assertNotEqual(subvolpath, None)
+
+        # create 99 files of 1MB to exceed quota
+        self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=99)
+
+        try:
+            # write two files of 1MB file to exceed the quota
+            self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=2)
+            # For quota to be enforced
+            time.sleep(20)
+            # create 400 files of 1MB to exceed quota
+            self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=400)
+        except CommandFailedError:
+            # Delete subvolume when group quota is exceeded
+            self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group)
+        else:
+            self.fail(f"expected filling subvolume {subvolname} with 400 files of size 1MB to fail")
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_quota_exceeded_subvolume_removal_retained_snaps(self):
+        """
+        Tests retained snapshot subvolume removal if it's group quota is exceeded
+        """
+        group = self._generate_random_group_name()
+        subvolname = self._generate_random_subvolume_name()
+        snapshot1, snapshot2 = self._generate_random_snapshot_name(2)
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*100
+        # create group with 100MB quota
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # create subvolume under the group
+        self._fs_cmd("subvolume", "create", self.volname, subvolname,
+                     "--group_name", group, "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group)
+        self.assertNotEqual(subvolpath, None)
+
+        # create 99 files of 1MB to exceed quota
+        self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=99)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot1, "--group_name", group)
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot2, "--group_name", group)
+
+        try:
+            # write two files of 1MB file to exceed the quota
+            self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=2)
+            # For quota to be enforced
+            time.sleep(20)
+            # create 400 files of 1MB to exceed quota
+            self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=400)
+        except CommandFailedError:
+            # remove with snapshot retention
+            self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group, "--retain-snapshots")
+            # remove snapshot1
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot1, "--group_name", group)
+            # remove snapshot2 (should remove volume)
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot2, "--group_name", group)
+            # verify subvolume trash is clean
+            self._wait_for_subvol_trash_empty(subvolname, group=group)
+        else:
+            self.fail(f"expected filling subvolume {subvolname} with 400 files of size 1MB to fail")
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_quota_subvolume_removal(self):
+        """
+        Tests subvolume removal if it's group quota is set.
+        """
+        # create group with size -- should set quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000")
+
+        # create subvolume under the group
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # remove subvolume
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume rm' command to succeed if group quota is set")
+
+        # remove subvolumegroup
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_quota_legacy_subvolume_removal(self):
+        """
+        Tests legacy subvolume removal if it's group quota is set.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # emulate a old-fashioned subvolume -- in a custom group
+        createpath1 = os.path.join(".", "volumes", group, subvolume)
+        self.mount_a.run_shell(['mkdir', '-p', createpath1], sudo=True)
+
+        # this would auto-upgrade on access without anyone noticing
+        subvolpath1 = self._fs_cmd("subvolume", "getpath", self.volname, subvolume, "--group-name", group)
+        self.assertNotEqual(subvolpath1, None)
+        subvolpath1 = subvolpath1.rstrip() # remove "/" prefix and any trailing newline
+
+        # and... the subvolume path returned should be what we created behind the scene
+        self.assertEqual(createpath1[1:], subvolpath1)
+
+        # Set subvolumegroup quota on idempotent subvolumegroup creation
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000")
+
+        # remove subvolume
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume rm' command to succeed if group quota is set")
+
+        # remove subvolumegroup
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_quota_v1_subvolume_removal(self):
+        """
+        Tests v1 subvolume removal if it's group quota is set.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # emulate a v1 subvolume -- in a custom group
+        self._create_v1_subvolume(subvolume, subvol_group=group, has_snapshot=False)
+
+        # Set subvolumegroup quota on idempotent subvolumegroup creation
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000")
+
+        # remove subvolume
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume rm' command to succeed if group quota is set")
+
+        # remove subvolumegroup
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_resize_fail_invalid_size(self):
+        """
+        That a subvolume group cannot be resized to an invalid size and the quota did not change
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        # create group with 1MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "--size", str(osize))
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # try to resize the subvolume with an invalid size -10
+        nsize = -10
+        try:
+            self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize))
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL,
+                             "invalid error code on resize of subvolume group with invalid size")
+        else:
+            self.fail("expected the 'fs subvolumegroup resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_resize_fail_zero_size(self):
+        """
+        That a subvolume group cannot be resized to a zero size and the quota did not change
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        # create group with 1MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "--size", str(osize))
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # try to resize the subvolume group with size 0
+        nsize = 0
+        try:
+            self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize))
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL,
+                             "invalid error code on resize of subvolume group with invalid size")
+        else:
+            self.fail("expected the 'fs subvolumegroup resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_resize_quota_lt_used_size(self):
+        """
+        That a subvolume group can be resized to a size smaller than the current used size
+        and the resulting quota matches the expected size.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*20
+        # create group with 20MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # create subvolume under the group
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname,
+                     "--group_name", group, "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group)
+        self.assertNotEqual(subvolpath, None)
+
+        # create one file of 10MB
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+1)
+        self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+
+        usedsize = int(self.mount_a.getfattr(subvolpath, "ceph.dir.rbytes"))
+
+        # shrink the subvolume group
+        nsize = usedsize // 2
+        try:
+            self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize))
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolumegroup resize' command to succeed")
+
+        # verify the quota
+        size = int(self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, nsize)
+
+        # remove subvolume and group
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_resize_fail_quota_lt_used_size_no_shrink(self):
+        """
+        That a subvolume group cannot be resized to a size smaller than the current used size
+        when --no_shrink is given and the quota did not change.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*20
+        # create group with 20MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # create subvolume under the group
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname,
+                     "--group_name", group, "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group)
+        self.assertNotEqual(subvolpath, None)
+
+        # create one file of 10MB
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+2)
+        self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+
+        usedsize = int(self.mount_a.getfattr(grouppath, "ceph.dir.rbytes"))
+
+        # shrink the subvolume group
+        nsize = usedsize // 2
+        try:
+            self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize), "--no_shrink")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolumegroup with quota less than used")
+        else:
+            self.fail("expected the 'fs subvolumegroup resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove subvolume and group
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_resize_expand_on_full_subvolume(self):
+        """
+        That the subvolume group can be expanded after it is full and future write succeed
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*100
+        # create group with 100MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # create subvolume under the group
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname,
+                     "--group_name", group, "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group)
+        self.assertNotEqual(subvolpath, None)
+
+        # create 99 files of 1MB
+        self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=99)
+
+        try:
+            # write two files of 1MB file to exceed the quota
+            self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=2)
+            # For quota to be enforced
+            time.sleep(20)
+            # create 500 files of 1MB
+            self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=500)
+        except CommandFailedError:
+            # Not able to write. So expand the subvolumegroup more and try writing the files again
+            nsize = osize*7
+            self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize))
+            try:
+                self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=500)
+            except CommandFailedError:
+                self.fail("expected filling subvolume {0} with 500 files of size 1MB "
+                          "to succeed".format(subvolname))
+        else:
+            self.fail("expected filling subvolume {0} with 500 files of size 1MB "
+                      "to fail".format(subvolname))
+
+        # remove subvolume and group
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_resize_infinite_size(self):
+        """
+        That a subvolume group can be resized to an infinite size by unsetting its quota.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize))
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # resize inf
+        self._fs_cmd("subvolumegroup", "resize", self.volname, group, "inf")
+
+        # verify that the quota is None
+        size = self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes")
+        self.assertEqual(size, None)
+
+        # remove subvolume group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_group_resize_infinite_size_future_writes(self):
+        """
+        That a subvolume group can be resized to an infinite size and the future writes succeed.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*5
+        # create group with 5MB quota
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group,
+                     "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        grouppath = self._get_subvolume_group_path(self.volname, group)
+        self.assertNotEqual(grouppath, None)
+
+        # create subvolume under the group
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname,
+                     "--group_name", group, "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group)
+        self.assertNotEqual(subvolpath, None)
+
+        # create 4 files of 1MB
+        self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=4)
+
+        try:
+            # write two files of 1MB file to exceed the quota
+            self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=2)
+            # For quota to be enforced
+            time.sleep(20)
+            # create 500 files of 1MB
+            self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=500)
+        except CommandFailedError:
+            # Not able to write. So resize subvolumegroup to 'inf' and try writing the files again
+            # resize inf
+            self._fs_cmd("subvolumegroup", "resize", self.volname, group, "inf")
+            try:
+                self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=500)
+            except CommandFailedError:
+                self.fail("expected filling subvolume {0} with 500 files of size 1MB "
+                          "to succeed".format(subvolname))
+        else:
+            self.fail("expected filling subvolume {0} with 500 files of size 1MB "
+                      "to fail".format(subvolname))
+
+
+        # verify that the quota is None
+        size = self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes")
+        self.assertEqual(size, None)
+
+        # remove subvolume and group
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_ls(self):
+        # tests the 'fs subvolumegroup ls' command
+
+        subvolumegroups = []
+
+        #create subvolumegroups
+        subvolumegroups = self._generate_random_group_name(3)
+        for groupname in subvolumegroups:
+            self._fs_cmd("subvolumegroup", "create", self.volname, groupname)
+
+        subvolumegroupls = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname))
+        if len(subvolumegroupls) == 0:
+            raise RuntimeError("Expected the 'fs subvolumegroup ls' command to list the created subvolume groups")
+        else:
+            subvolgroupnames = [subvolumegroup['name'] for subvolumegroup in subvolumegroupls]
+            if collections.Counter(subvolgroupnames) != collections.Counter(subvolumegroups):
+                raise RuntimeError("Error creating or listing subvolume groups")
+
+    def test_subvolume_group_ls_filter(self):
+        # tests the 'fs subvolumegroup ls' command filters '_deleting' directory
+
+        subvolumegroups = []
+
+        #create subvolumegroup
+        subvolumegroups = self._generate_random_group_name(3)
+        for groupname in subvolumegroups:
+            self._fs_cmd("subvolumegroup", "create", self.volname, groupname)
+
+        # create subvolume and remove. This creates '_deleting' directory.
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        subvolumegroupls = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname))
+        subvolgroupnames = [subvolumegroup['name'] for subvolumegroup in subvolumegroupls]
+        if "_deleting" in subvolgroupnames:
+            self.fail("Listing subvolume groups listed '_deleting' directory")
+
+    def test_subvolume_group_ls_filter_internal_directories(self):
+        # tests the 'fs subvolumegroup ls' command filters internal directories
+        # eg: '_deleting', '_nogroup', '_index', "_legacy"
+
+        subvolumegroups = self._generate_random_group_name(3)
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        #create subvolumegroups
+        for groupname in subvolumegroups:
+            self._fs_cmd("subvolumegroup", "create", self.volname, groupname)
+
+        # create subvolume which will create '_nogroup' directory
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # create snapshot
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # clone snapshot which will create '_index' directory
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # wait for clone to complete
+        self._wait_for_clone_to_complete(clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume which will create '_deleting' directory
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # list subvolumegroups
+        ret = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname))
+        self.assertEqual(len(ret), len(subvolumegroups))
+
+        ret_list = [subvolumegroup['name'] for subvolumegroup in ret]
+        self.assertEqual(len(ret_list), len(subvolumegroups))
+
+        self.assertEqual(all(elem in subvolumegroups for elem in ret_list), True)
+
+        # cleanup
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+        for groupname in subvolumegroups:
+            self._fs_cmd("subvolumegroup", "rm", self.volname, groupname)
+
+    def test_subvolume_group_ls_for_nonexistent_volume(self):
+        # tests the 'fs subvolumegroup ls' command when /volume doesn't exist
+        # prerequisite: we expect that the test volume is created and a subvolumegroup is NOT created
+
+        # list subvolume groups
+        subvolumegroupls = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname))
+        if len(subvolumegroupls) > 0:
+            raise RuntimeError("Expected the 'fs subvolumegroup ls' command to output an empty list")
+
+    def test_subvolumegroup_pin_distributed(self):
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+        self.config_set('mds', 'mds_export_ephemeral_distributed', True)
+
+        group = "pinme"
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        self._fs_cmd("subvolumegroup", "pin", self.volname, group, "distributed", "True")
+        subvolumes = self._generate_random_subvolume_name(50)
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+        self._wait_distributed_subtrees(2 * 2, status=status, rank="all")
+
+        # remove subvolumes
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_group_rm_force(self):
+        # test removing non-existing subvolume group with --force
+        group = self._generate_random_group_name()
+        try:
+            self._fs_cmd("subvolumegroup", "rm", self.volname, group, "--force")
+        except CommandFailedError:
+            raise RuntimeError("expected the 'fs subvolumegroup rm --force' command to succeed")
+
+    def test_subvolume_group_exists_with_subvolumegroup_and_no_subvolume(self):
+        """Test the presence of any subvolumegroup when only subvolumegroup is present"""
+
+        group = self._generate_random_group_name()
+        # create subvolumegroup
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "subvolumegroup exists")
+        # delete subvolumegroup
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "no subvolumegroup exists")
+
+    def test_subvolume_group_exists_with_no_subvolumegroup_and_subvolume(self):
+        """Test the presence of any subvolumegroup when no subvolumegroup is present"""
+
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "no subvolumegroup exists")
+
+    def test_subvolume_group_exists_with_subvolumegroup_and_subvolume(self):
+        """Test the presence of any subvolume when subvolumegroup
+            and subvolume both are present"""
+
+        group = self._generate_random_group_name()
+        subvolume = self._generate_random_subvolume_name(2)
+        # create subvolumegroup
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume[0], "--group_name", group)
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume[1])
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "subvolumegroup exists")
+        # delete subvolume in group
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume[0], "--group_name", group)
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "subvolumegroup exists")
+        # delete subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume[1])
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "subvolumegroup exists")
+        # delete subvolumegroup
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "no subvolumegroup exists")
+
+    def test_subvolume_group_exists_without_subvolumegroup_and_with_subvolume(self):
+        """Test the presence of any subvolume when subvolume is present
+            but no subvolumegroup is present"""
+
+        subvolume = self._generate_random_subvolume_name()
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "no subvolumegroup exists")
+        # delete subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        ret = self._fs_cmd("subvolumegroup", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "no subvolumegroup exists")
+
+
+class TestSubvolumes(TestVolumesHelper):
+    """Tests for FS subvolume operations, except snapshot and snapshot clone."""
+    def test_async_subvolume_rm(self):
+        subvolumes = self._generate_random_subvolume_name(100)
+
+        # create subvolumes
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+            self._do_subvolume_io(subvolume, number_of_files=10)
+
+        self.mount_a.umount_wait()
+
+        # remove subvolumes
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        self.mount_a.mount_wait()
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty(timeout=300)
+
+    def test_default_uid_gid_subvolume(self):
+        subvolume = self._generate_random_subvolume_name()
+        expected_uid = 0
+        expected_gid = 0
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # check subvolume's uid and gid
+        stat = self.mount_a.stat(subvol_path)
+        self.assertEqual(stat['st_uid'], expected_uid)
+        self.assertEqual(stat['st_gid'], expected_gid)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_nonexistent_subvolume_rm(self):
+        # remove non-existing subvolume
+        subvolume = "non_existent_subvolume"
+
+        # try, remove subvolume
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolume rm' command to fail")
+
+    def test_subvolume_create_and_rm(self):
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # make sure it exists
+        subvolpath = self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        self.assertNotEqual(subvolpath, None)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        # make sure its gone
+        try:
+            self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolume getpath' command to fail. Subvolume not removed.")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_and_rm_in_group(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_create_idempotence(self):
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # try creating w/ same subvolume name -- should be idempotent
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_idempotence_resize(self):
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # try creating w/ same subvolume name with size -- should set quota
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "1000000000")
+
+        # get subvolume metadata
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        self.assertEqual(subvol_info["bytes_quota"], 1000000000)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_idempotence_mode(self):
+        # default mode
+        default_mode = "755"
+
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+
+        actual_mode_1 = self.mount_a.run_shell(['stat', '-c' '%a', subvol_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode_1, default_mode)
+
+        # try creating w/ same subvolume name with --mode 777
+        new_mode = "777"
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", new_mode)
+
+        actual_mode_2 = self.mount_a.run_shell(['stat', '-c' '%a', subvol_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode_2, new_mode)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_idempotence_without_passing_mode(self):
+        # create subvolume
+        desired_mode = "777"
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", desired_mode)
+
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+
+        actual_mode_1 = self.mount_a.run_shell(['stat', '-c' '%a', subvol_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode_1, desired_mode)
+
+        # default mode
+        default_mode = "755"
+
+        # try creating w/ same subvolume name without passing --mode argument
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        actual_mode_2 = self.mount_a.run_shell(['stat', '-c' '%a', subvol_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode_2, default_mode)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_isolated_namespace(self):
+        """
+        Create subvolume in separate rados namespace
+        """
+
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--namespace-isolated")
+
+        # get subvolume metadata
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        self.assertNotEqual(len(subvol_info), 0)
+        self.assertEqual(subvol_info["pool_namespace"], "fsvolumens_" + subvolume)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_auto_cleanup_on_fail(self):
+        subvolume = self._generate_random_subvolume_name()
+        data_pool = "invalid_pool"
+        # create subvolume with invalid data pool layout fails
+        with self.assertRaises(CommandFailedError):
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool)
+
+        # check whether subvol path is cleaned up
+        try:
+            self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of non-existent subvolume")
+        else:
+            self.fail("expected the 'fs subvolume getpath' command to fail")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_desired_data_pool_layout_in_group(self):
+        subvol1, subvol2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # create group. this also helps set default pool layout for subvolumes
+        # created within the group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvol1, "--group_name", group)
+        subvol1_path = self._get_subvolume_path(self.volname, subvol1, group_name=group)
+
+        default_pool = self.mount_a.getfattr(subvol1_path, "ceph.dir.layout.pool")
+        new_pool = "new_pool"
+        self.assertNotEqual(default_pool, new_pool)
+
+        # add data pool
+        newid = self.fs.add_data_pool(new_pool)
+
+        # create subvolume specifying the new data pool as its pool layout
+        self._fs_cmd("subvolume", "create", self.volname, subvol2, "--group_name", group,
+                     "--pool_layout", new_pool)
+        subvol2_path = self._get_subvolume_path(self.volname, subvol2, group_name=group)
+
+        desired_pool = self.mount_a.getfattr(subvol2_path, "ceph.dir.layout.pool")
+        try:
+            self.assertEqual(desired_pool, new_pool)
+        except AssertionError:
+            self.assertEqual(int(desired_pool), newid) # old kernel returns id
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvol2, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvol1, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_desired_mode(self):
+        subvol1 = self._generate_random_subvolume_name()
+
+        # default mode
+        default_mode = "755"
+        # desired mode
+        desired_mode = "777"
+
+        self._fs_cmd("subvolume", "create", self.volname, subvol1,  "--mode", "777")
+
+        subvol1_path = self._get_subvolume_path(self.volname, subvol1)
+
+        # check subvolumegroup's mode
+        subvol_par_path = os.path.dirname(subvol1_path)
+        group_path = os.path.dirname(subvol_par_path)
+        actual_mode1 = self.mount_a.run_shell(['stat', '-c' '%a', group_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode1, default_mode)
+        # check /volumes mode
+        volumes_path = os.path.dirname(group_path)
+        actual_mode2 = self.mount_a.run_shell(['stat', '-c' '%a', volumes_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode2, default_mode)
+        # check subvolume's  mode
+        actual_mode3 = self.mount_a.run_shell(['stat', '-c' '%a', subvol1_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode3, desired_mode)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvol1)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_desired_mode_in_group(self):
+        subvol1, subvol2, subvol3 = self._generate_random_subvolume_name(3)
+
+        group = self._generate_random_group_name()
+        # default mode
+        expected_mode1 = "755"
+        # desired mode
+        expected_mode2 = "777"
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvol1, "--group_name", group)
+        self._fs_cmd("subvolume", "create", self.volname, subvol2, "--group_name", group, "--mode", "777")
+        # check whether mode 0777 also works
+        self._fs_cmd("subvolume", "create", self.volname, subvol3, "--group_name", group, "--mode", "0777")
+
+        subvol1_path = self._get_subvolume_path(self.volname, subvol1, group_name=group)
+        subvol2_path = self._get_subvolume_path(self.volname, subvol2, group_name=group)
+        subvol3_path = self._get_subvolume_path(self.volname, subvol3, group_name=group)
+
+        # check subvolume's  mode
+        actual_mode1 = self.mount_a.run_shell(['stat', '-c' '%a', subvol1_path]).stdout.getvalue().strip()
+        actual_mode2 = self.mount_a.run_shell(['stat', '-c' '%a', subvol2_path]).stdout.getvalue().strip()
+        actual_mode3 = self.mount_a.run_shell(['stat', '-c' '%a', subvol3_path]).stdout.getvalue().strip()
+        self.assertEqual(actual_mode1, expected_mode1)
+        self.assertEqual(actual_mode2, expected_mode2)
+        self.assertEqual(actual_mode3, expected_mode2)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvol1, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvol2, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvol3, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_desired_uid_gid(self):
+        """
+        That the subvolume can be created with the desired uid and gid and its uid and gid matches the
+        expected values.
+        """
+        uid = 1000
+        gid = 1000
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--uid", str(uid), "--gid", str(gid))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # verify the uid and gid
+        suid = int(self.mount_a.run_shell(['stat', '-c' '%u', subvolpath]).stdout.getvalue().strip())
+        sgid = int(self.mount_a.run_shell(['stat', '-c' '%g', subvolpath]).stdout.getvalue().strip())
+        self.assertEqual(uid, suid)
+        self.assertEqual(gid, sgid)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_invalid_data_pool_layout(self):
+        subvolume = self._generate_random_subvolume_name()
+        data_pool = "invalid_pool"
+        # create subvolume with invalid data pool layout
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on create of subvolume with invalid pool layout")
+        else:
+            self.fail("expected the 'fs subvolume create' command to fail")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_with_invalid_size(self):
+        # create subvolume with an invalid size -1
+        subvolume = self._generate_random_subvolume_name()
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--size", "-1")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on create of subvolume with invalid size")
+        else:
+            self.fail("expected the 'fs subvolume create' command to fail")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_create_and_ls_providing_group_as_nogroup(self):
+        """
+        That a 'subvolume create' and 'subvolume ls' should throw
+        permission denied error if option --group=_nogroup is provided.
+        """
+
+        subvolname = self._generate_random_subvolume_name()
+
+        # try to create subvolume providing --group_name=_nogroup option
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", "_nogroup")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM)
+        else:
+            self.fail("expected the 'fs subvolume create' command to fail")
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolname)
+
+        # try to list subvolumes providing --group_name=_nogroup option
+        try:
+            self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_nogroup")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM)
+        else:
+            self.fail("expected the 'fs subvolume ls' command to fail")
+
+        # list subvolumes
+        self._fs_cmd("subvolume", "ls", self.volname)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_expand(self):
+        """
+        That a subvolume can be expanded in size and its quota matches the expected size.
+        """
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # expand the subvolume
+        nsize = osize*2
+        self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+
+        # verify the quota
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, nsize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_info(self):
+        # tests the 'fs subvolume info' command
+
+        subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime",
+                     "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace",
+                     "type", "uid", "features", "state"]
+
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # get subvolume metadata
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        for md in subvol_md:
+            self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md))
+
+        self.assertEqual(subvol_info["bytes_pcent"], "undefined", "bytes_pcent should be set to undefined if quota is not set")
+        self.assertEqual(subvol_info["bytes_quota"], "infinite", "bytes_quota should be set to infinite if quota is not set")
+        self.assertEqual(subvol_info["pool_namespace"], "", "expected pool namespace to be empty")
+        self.assertEqual(subvol_info["state"], "complete", "expected state to be complete")
+
+        self.assertEqual(len(subvol_info["features"]), 3,
+                         msg="expected 3 features, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"]))
+        for feature in ['snapshot-clone', 'snapshot-autoprotect', 'snapshot-retention']:
+            self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
+
+        nsize = self.DEFAULT_FILE_SIZE*1024*1024
+        self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize))
+
+        # get subvolume metadata after quota set
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        for md in subvol_md:
+            self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md))
+
+        self.assertNotEqual(subvol_info["bytes_pcent"], "undefined", "bytes_pcent should not be set to undefined if quota is not set")
+        self.assertEqual(subvol_info["bytes_quota"], nsize, "bytes_quota should be set to '{0}'".format(nsize))
+        self.assertEqual(subvol_info["type"], "subvolume", "type should be set to subvolume")
+        self.assertEqual(subvol_info["state"], "complete", "expected state to be complete")
+
+        self.assertEqual(len(subvol_info["features"]), 3,
+                         msg="expected 3 features, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"]))
+        for feature in ['snapshot-clone', 'snapshot-autoprotect', 'snapshot-retention']:
+            self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_ls(self):
+        # tests the 'fs subvolume ls' command
+
+        subvolumes = []
+
+        # create subvolumes
+        subvolumes = self._generate_random_subvolume_name(3)
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # list subvolumes
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        if len(subvolumels) == 0:
+            self.fail("Expected the 'fs subvolume ls' command to list the created subvolumes.")
+        else:
+            subvolnames = [subvolume['name'] for subvolume in subvolumels]
+            if collections.Counter(subvolnames) != collections.Counter(subvolumes):
+                self.fail("Error creating or listing subvolumes")
+
+        # remove subvolume
+        for subvolume in subvolumes:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_ls_with_groupname_as_internal_directory(self):
+        # tests the 'fs subvolume ls' command when the default groupname as internal directories
+        # Eg: '_nogroup', '_legacy', '_deleting', '_index'.
+        # Expecting 'fs subvolume ls' will be fail with errno EINVAL for '_legacy', '_deleting', '_index'
+        # Expecting 'fs subvolume ls' will be fail with errno EPERM for '_nogroup'
+
+        # try to list subvolumes providing --group_name=_nogroup option
+        try:
+            self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_nogroup")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM)
+        else:
+            self.fail("expected the 'fs subvolume ls' command to fail with error 'EPERM' for _nogroup")
+
+        # try to list subvolumes providing --group_name=_legacy option
+        try:
+            self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_legacy")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL)
+        else:
+            self.fail("expected the 'fs subvolume ls' command to fail with error 'EINVAL' for _legacy")
+
+        # try to list subvolumes providing --group_name=_deleting option
+        try:
+            self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_deleting")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL)
+        else:
+            self.fail("expected the 'fs subvolume ls' command to fail with error 'EINVAL' for _deleting")
+
+        # try to list subvolumes providing --group_name=_index option
+        try:
+            self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_index")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL)
+        else:
+            self.fail("expected the 'fs subvolume ls' command to fail with error 'EINVAL' for _index")
+
+    def test_subvolume_ls_for_notexistent_default_group(self):
+        # tests the 'fs subvolume ls' command when the default group '_nogroup' doesn't exist
+        # prerequisite: we expect that the volume is created and the default group _nogroup is
+        # NOT created (i.e. a subvolume without group is not created)
+
+        # list subvolumes
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        if len(subvolumels) > 0:
+            raise RuntimeError("Expected the 'fs subvolume ls' command to output an empty list.")
+
+    def test_subvolume_marked(self):
+        """
+        ensure a subvolume is marked with the ceph.dir.subvolume xattr
+        """
+        subvolume = self._generate_random_subvolume_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # getpath
+        subvolpath = self._get_subvolume_path(self.volname, subvolume)
+
+        # subdirectory of a subvolume cannot be moved outside the subvolume once marked with
+        # the xattr ceph.dir.subvolume, hence test by attempting to rename subvol path (incarnation)
+        # outside the subvolume
+        dstpath = os.path.join(self.mount_a.mountpoint, 'volumes', '_nogroup', 'new_subvol_location')
+        srcpath = os.path.join(self.mount_a.mountpoint, subvolpath)
+        rename_script = dedent("""
+            import os
+            import errno
+            try:
+                os.rename("{src}", "{dst}")
+            except OSError as e:
+                if e.errno != errno.EXDEV:
+                    raise RuntimeError("invalid error code on renaming subvolume incarnation out of subvolume directory")
+            else:
+                raise RuntimeError("expected renaming subvolume incarnation out of subvolume directory to fail")
+            """)
+        self.mount_a.run_python(rename_script.format(src=srcpath, dst=dstpath), sudo=True)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_pin_export(self):
+        self.fs.set_max_mds(2)
+        status = self.fs.wait_for_daemons()
+
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        self._fs_cmd("subvolume", "pin", self.volname, subvolume, "export", "1")
+        path = self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        path = os.path.dirname(path) # get subvolume path
+
+        self._get_subtrees(status=status, rank=1)
+        self._wait_subtrees([(path, 1)], status=status)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    ### authorize operations
+
+    def test_authorize_deauthorize_legacy_subvolume(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        authid = "alice"
+
+        guest_mount = self.mount_b
+        guest_mount.umount_wait()
+
+        # emulate a old-fashioned subvolume in a custom group
+        createpath = os.path.join(".", "volumes", group, subvolume)
+        self.mount_a.run_shell(['mkdir', '-p', createpath], sudo=True)
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True)
+
+        mount_path = os.path.join("/", "volumes", group, subvolume)
+
+        # authorize guest authID read-write access to subvolume
+        key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid,
+                           "--group_name", group, "--tenant_id", "tenant_id")
+
+        # guest authID should exist
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertIn("client.{0}".format(authid), existing_ids)
+
+        # configure credentials for guest client
+        self._configure_guest_auth(guest_mount, authid, key)
+
+        # mount the subvolume, and write to it
+        guest_mount.mount_wait(cephfs_mntpt=mount_path)
+        guest_mount.write_n_mb("data.bin", 1)
+
+        # authorize guest authID read access to subvolume
+        key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid,
+                           "--group_name", group, "--tenant_id", "tenant_id", "--access_level", "r")
+
+        # guest client sees the change in access level to read only after a
+        # remount of the subvolume.
+        guest_mount.umount_wait()
+        guest_mount.mount_wait(cephfs_mntpt=mount_path)
+
+        # read existing content of the subvolume
+        self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
+        # cannot write into read-only subvolume
+        with self.assertRaises(CommandFailedError):
+            guest_mount.write_n_mb("rogue.bin", 1)
+
+        # cleanup
+        guest_mount.umount_wait()
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid,
+                     "--group_name", group)
+        # guest authID should no longer exist
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertNotIn("client.{0}".format(authid), existing_ids)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_authorize_deauthorize_subvolume(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        authid = "alice"
+
+        guest_mount = self.mount_b
+        guest_mount.umount_wait()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group, "--mode=777")
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+        mount_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolume,
+                                  "--group_name", group).rstrip()
+
+        # authorize guest authID read-write access to subvolume
+        key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid,
+                           "--group_name", group, "--tenant_id", "tenant_id")
+
+        # guest authID should exist
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertIn("client.{0}".format(authid), existing_ids)
+
+        # configure credentials for guest client
+        self._configure_guest_auth(guest_mount, authid, key)
+
+        # mount the subvolume, and write to it
+        guest_mount.mount_wait(cephfs_mntpt=mount_path)
+        guest_mount.write_n_mb("data.bin", 1)
+
+        # authorize guest authID read access to subvolume
+        key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid,
+                           "--group_name", group, "--tenant_id", "tenant_id", "--access_level", "r")
+
+        # guest client sees the change in access level to read only after a
+        # remount of the subvolume.
+        guest_mount.umount_wait()
+        guest_mount.mount_wait(cephfs_mntpt=mount_path)
+
+        # read existing content of the subvolume
+        self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
+        # cannot write into read-only subvolume
+        with self.assertRaises(CommandFailedError):
+            guest_mount.write_n_mb("rogue.bin", 1)
+
+        # cleanup
+        guest_mount.umount_wait()
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid,
+                     "--group_name", group)
+        # guest authID should no longer exist
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertNotIn("client.{0}".format(authid), existing_ids)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_multitenant_subvolumes(self):
+        """
+        That subvolume access can be restricted to a tenant.
+
+        That metadata used to enforce tenant isolation of
+        subvolumes is stored as a two-way mapping between auth
+        IDs and subvolumes that they're authorized to access.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        guest_mount = self.mount_b
+
+        # Guest clients belonging to different tenants, but using the same
+        # auth ID.
+        auth_id = "alice"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+        guestclient_2 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant2",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Check that subvolume metadata file is created on subvolume creation.
+        subvol_metadata_filename = "_{0}:{1}.meta".format(group, subvolume)
+        self.assertIn(subvol_metadata_filename, guest_mount.ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'alice' and belonging to
+        # 'tenant1', with 'rw' access to the volume.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'alice', is
+        # created on authorizing 'alice' access to the subvolume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different subvolumes, versioning details, etc.
+        expected_auth_metadata = {
+            "version": 5,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "{0}/{1}".format(group,subvolume): {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename)))
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # Verify that the subvolume metadata file stores info about auth IDs
+        # and their access levels to the subvolume, versioning details, etc.
+        expected_subvol_metadata = {
+            "version": 1,
+            "compat_version": 1,
+            "auths": {
+                "alice": {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+        subvol_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(subvol_metadata_filename)))
+
+        self.assertGreaterEqual(subvol_metadata["version"], expected_subvol_metadata["version"])
+        del expected_subvol_metadata["version"]
+        del subvol_metadata["version"]
+        self.assertEqual(expected_subvol_metadata, subvol_metadata)
+
+        # Cannot authorize 'guestclient_2' to access the volume.
+        # It uses auth ID 'alice', which has already been used by a
+        # 'guestclient_1' belonging to an another tenant for accessing
+        # the volume.
+
+        try:
+            self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_2["auth_id"],
+                         "--group_name", group, "--tenant_id", guestclient_2["tenant_id"])
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM,
+                             "Invalid error code returned on authorize of subvolume with same auth_id but different tenant_id")
+        else:
+            self.fail("expected the 'fs subvolume authorize' command to fail")
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's only access to a volume.
+
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id,
+                     "--group_name", group)
+        self.assertNotIn(auth_metadata_filename, guest_mount.ls("volumes"))
+
+        # Check that subvolume metadata file is cleaned up on subvolume deletion.
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self.assertNotIn(subvol_metadata_filename, guest_mount.ls("volumes"))
+
+        # clean up
+        guest_mount.umount_wait()
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_authorized_list(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        authid1 = "alice"
+        authid2 = "guest1"
+        authid3 = "guest2"
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # authorize alice authID read-write access to subvolume
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid1,
+                     "--group_name", group)
+        # authorize guest1 authID read-write access to subvolume
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid2,
+                     "--group_name", group)
+        # authorize guest2 authID read access to subvolume
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid3,
+                     "--group_name", group, "--access_level", "r")
+
+        # list authorized-ids of the subvolume
+        expected_auth_list = [{'alice': 'rw'}, {'guest1': 'rw'}, {'guest2': 'r'}]
+        auth_list = json.loads(self._fs_cmd('subvolume', 'authorized_list', self.volname, subvolume, "--group_name", group))
+        self.assertCountEqual(expected_auth_list, auth_list)
+
+        # cleanup
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid1,
+                     "--group_name", group)
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid2,
+                     "--group_name", group)
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid3,
+                     "--group_name", group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_authorize_auth_id_not_created_by_mgr_volumes(self):
+        """
+        If the auth_id already exists and is not created by mgr plugin,
+        it's not allowed to authorize the auth-id by default.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # Create auth_id
+        self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.guest1",
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        try:
+            self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                         "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EPERM,
+                             "Invalid error code returned on authorize of subvolume for auth_id created out of band")
+        else:
+            self.fail("expected the 'fs subvolume authorize' command to fail")
+
+        # clean up
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_authorize_allow_existing_id_option(self):
+        """
+        If the auth_id already exists and is not created by mgr volumes,
+        it's not allowed to authorize the auth-id by default but is
+        allowed with option allow_existing_id.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # Create auth_id
+        self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.guest1",
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Cannot authorize 'guestclient_1' to access the volume by default,
+        # which already exists and not created by mgr volumes but is allowed
+        # with option 'allow_existing_id'.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"], "--allow-existing-id")
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id,
+                     "--group_name", group)
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_deauthorize_auth_id_after_out_of_band_update(self):
+        """
+        If the auth_id authorized by mgr/volumes plugin is updated
+        out of band, the auth_id should not be deleted after a
+        deauthorize. It should only remove caps associated with it.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        subvol_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolume,
+                                  "--group_name", group).rstrip()
+
+        # Update caps for guestclient_1 out of band
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "caps", "client.guest1",
+            "mds", "allow rw path=/volumes/{0}, allow rw path={1}".format(group, subvol_path),
+            "osd", "allow rw pool=cephfs_data",
+            "mon", "allow r",
+            "mgr", "allow *"
+        )
+
+        # Deauthorize guestclient_1
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id, "--group_name", group)
+
+        # Validate the caps of guestclient_1 after deauthorize. It should not have deleted
+        # guestclient_1. The mgr and mds caps should be present which was updated out of band.
+        out = json.loads(self.fs.mon_manager.raw_cluster_cmd("auth", "get", "client.guest1", "--format=json-pretty"))
+
+        self.assertEqual("client.guest1", out[0]["entity"])
+        self.assertEqual("allow rw path=/volumes/{0}".format(group), out[0]["caps"]["mds"])
+        self.assertEqual("allow *", out[0]["caps"]["mgr"])
+        self.assertNotIn("osd", out[0]["caps"])
+
+        # clean up
+        out = self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_recover_auth_metadata_during_authorize(self):
+        """
+        That auth metadata manager can recover from partial auth updates using
+        metadata files, which store auth info and its update status info. This
+        test validates the recovery during authorize.
+        """
+
+        guest_mount = self.mount_b
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'guest1', is
+        # created on authorizing 'guest1' access to the subvolume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+        expected_auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename)))
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run authorize again.
+        guest_mount.run_shell(['sed', '-i', 's/false/true/g', 'volumes/{0}'.format(auth_metadata_filename)], sudo=True)
+
+        # Authorize 'guestclient_1' to access the subvolume.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename)))
+        self.assertEqual(auth_metadata_content, expected_auth_metadata_content)
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id, "--group_name", group)
+        guest_mount.umount_wait()
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_recover_auth_metadata_during_deauthorize(self):
+        """
+        That auth metadata manager can recover from partial auth updates using
+        metadata files, which store auth info and its update status info. This
+        test validates the recovery during deauthorize.
+        """
+
+        guest_mount = self.mount_b
+
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        guestclient_1 = {
+            "auth_id": "guest1",
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolumes in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume1.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'guest1', is
+        # created on authorizing 'guest1' access to the subvolume1.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+        expected_auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename)))
+
+        # Authorize 'guestclient_1' to access the subvolume2.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run de-authorize.
+        guest_mount.run_shell(['sed', '-i', 's/false/true/g', 'volumes/{0}'.format(auth_metadata_filename)], sudo=True)
+
+        # Deauthorize 'guestclient_1' to access the subvolume2.
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, guestclient_1["auth_id"],
+                     "--group_name", group)
+
+        auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename)))
+        self.assertEqual(auth_metadata_content, expected_auth_metadata_content)
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, "guest1", "--group_name", group)
+        guest_mount.umount_wait()
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_update_old_style_auth_metadata_to_new_during_authorize(self):
+        """
+        CephVolumeClient stores the subvolume data in auth metadata file with
+        'volumes' key as there was no subvolume namespace. It doesn't makes sense
+        with mgr/volumes. This test validates the transparent update of 'volumes'
+        key to 'subvolumes' key in auth metadata file during authorize.
+        """
+
+        guest_mount = self.mount_b
+
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolumes in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume1.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'guest1', is
+        # created on authorizing 'guest1' access to the subvolume1.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        guest_mount.run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)], sudo=True)
+
+        # Authorize 'guestclient_1' to access the subvolume2. This should transparently update 'volumes' to 'subvolumes'
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        expected_auth_metadata = {
+            "version": 5,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "{0}/{1}".format(group,subvolume1): {
+                    "dirty": False,
+                    "access_level": "rw"
+                },
+                "{0}/{1}".format(group,subvolume2): {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename)))
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, auth_id, "--group_name", group)
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, auth_id, "--group_name", group)
+        guest_mount.umount_wait()
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_update_old_style_auth_metadata_to_new_during_deauthorize(self):
+        """
+        CephVolumeClient stores the subvolume data in auth metadata file with
+        'volumes' key as there was no subvolume namespace. It doesn't makes sense
+        with mgr/volumes. This test validates the transparent update of 'volumes'
+        key to 'subvolumes' key in auth metadata file during deauthorize.
+        """
+
+        guest_mount = self.mount_b
+
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        auth_id = "guest1"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolumes in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group)
+
+        # Authorize 'guestclient_1' to access the subvolume1.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Authorize 'guestclient_1' to access the subvolume2.
+        self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"],
+                     "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+        # Check that auth metadata file for auth ID 'guest1', is created.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, guest_mount.ls("volumes"))
+
+        # Replace 'subvolumes' to 'volumes', old style auth-metadata file
+        guest_mount.run_shell(['sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)], sudo=True)
+
+        # Deauthorize 'guestclient_1' to access the subvolume2. This should update 'volumes' to subvolumes'
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, auth_id, "--group_name", group)
+
+        expected_auth_metadata = {
+            "version": 5,
+            "compat_version": 6,
+            "dirty": False,
+            "tenant_id": "tenant1",
+            "subvolumes": {
+                "{0}/{1}".format(group,subvolume1): {
+                    "dirty": False,
+                    "access_level": "rw"
+                }
+            }
+        }
+
+        auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename)))
+
+        self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"])
+        del expected_auth_metadata["version"]
+        del auth_metadata["version"]
+        self.assertEqual(expected_auth_metadata, auth_metadata)
+
+        # clean up
+        self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, auth_id, "--group_name", group)
+        guest_mount.umount_wait()
+        self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1")
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_evict_client(self):
+        """
+        That a subvolume client can be evicted based on the auth ID
+        """
+
+        subvolumes = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # mounts[0] and mounts[1] would be used as guests to mount the volumes/shares.
+        for i in range(0, 2):
+            self.mounts[i].umount_wait()
+        guest_mounts = (self.mounts[0], self.mounts[1])
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create two subvolumes. Authorize 'guest' auth ID to mount the two
+        # subvolumes. Mount the two subvolumes. Write data to the volumes.
+        for i in range(2):
+            # Create subvolume.
+            self._fs_cmd("subvolume", "create", self.volname, subvolumes[i], "--group_name", group, "--mode=777")
+
+            # authorize guest authID read-write access to subvolume
+            key = self._fs_cmd("subvolume", "authorize", self.volname, subvolumes[i], guestclient_1["auth_id"],
+                               "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+
+            mount_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolumes[i],
+                                      "--group_name", group).rstrip()
+            # configure credentials for guest client
+            self._configure_guest_auth(guest_mounts[i], auth_id, key)
+
+            # mount the subvolume, and write to it
+            guest_mounts[i].mount_wait(cephfs_mntpt=mount_path)
+            guest_mounts[i].write_n_mb("data.bin", 1)
+
+        # Evict client, guest_mounts[0], using auth ID 'guest' and has mounted
+        # one volume.
+        self._fs_cmd("subvolume", "evict", self.volname, subvolumes[0], auth_id, "--group_name", group)
+
+        # Evicted guest client, guest_mounts[0], should not be able to do
+        # anymore metadata ops.  It should start failing all operations
+        # when it sees that its own address is in the blocklist.
+        try:
+            guest_mounts[0].write_n_mb("rogue.bin", 1)
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError("post-eviction write should have failed!")
+
+        # The blocklisted guest client should now be unmountable
+        guest_mounts[0].umount_wait()
+
+        # Guest client, guest_mounts[1], using the same auth ID 'guest', but
+        # has mounted the other volume, should be able to use its volume
+        # unaffected.
+        guest_mounts[1].write_n_mb("data.bin.1", 1)
+
+        # Cleanup.
+        guest_mounts[1].umount_wait()
+        for i in range(2):
+            self._fs_cmd("subvolume", "deauthorize", self.volname, subvolumes[i], auth_id, "--group_name", group)
+            self._fs_cmd("subvolume", "rm", self.volname, subvolumes[i], "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_pin_random(self):
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+        self.config_set('mds', 'mds_export_ephemeral_random', True)
+
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        self._fs_cmd("subvolume", "pin", self.volname, subvolume, "random", ".01")
+        # no verification
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_fail_invalid_size(self):
+        """
+        That a subvolume cannot be resized to an invalid size and the quota did not change
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # try to resize the subvolume with an invalid size -10
+        nsize = -10
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size")
+        else:
+            self.fail("expected the 'fs subvolume resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_fail_zero_size(self):
+        """
+        That a subvolume cannot be resized to a zero size and the quota did not change
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # try to resize the subvolume with size 0
+        nsize = 0
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size")
+        else:
+            self.fail("expected the 'fs subvolume resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_quota_lt_used_size(self):
+        """
+        That a subvolume can be resized to a size smaller than the current used size
+        and the resulting quota matches the expected size.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*20
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # create one file of 10MB
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+1)
+        self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+
+        usedsize = int(self.mount_a.getfattr(subvolpath, "ceph.dir.rbytes"))
+        susedsize = int(self.mount_a.run_shell(['stat', '-c' '%s', subvolpath]).stdout.getvalue().strip())
+        if isinstance(self.mount_a, FuseMount):
+            # kclient dir does not have size==rbytes
+            self.assertEqual(usedsize, susedsize)
+
+        # shrink the subvolume
+        nsize = usedsize // 2
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume resize' command to succeed")
+
+        # verify the quota
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, nsize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_fail_quota_lt_used_size_no_shrink(self):
+        """
+        That a subvolume cannot be resized to a size smaller than the current used size
+        when --no_shrink is given and the quota did not change.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*20
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize), "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # create one file of 10MB
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+2)
+        self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+
+        usedsize = int(self.mount_a.getfattr(subvolpath, "ceph.dir.rbytes"))
+        susedsize = int(self.mount_a.run_shell(['stat', '-c' '%s', subvolpath]).stdout.getvalue().strip())
+        if isinstance(self.mount_a, FuseMount):
+            # kclient dir does not have size==rbytes
+            self.assertEqual(usedsize, susedsize)
+
+        # shrink the subvolume
+        nsize = usedsize // 2
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize), "--no_shrink")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size")
+        else:
+            self.fail("expected the 'fs subvolume resize' command to fail")
+
+        # verify the quota did not change
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, osize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_expand_on_full_subvolume(self):
+        """
+        That the subvolume can be expanded from a full subvolume and future writes succeed.
+        """
+
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*10
+        # create subvolume of quota 10MB and make sure it exists
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize), "--mode=777")
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # create one file of size 10MB and write
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+3)
+        self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+
+        # create a file of size 5MB and try write more
+        file_size=file_size // 2
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+4)
+        try:
+            self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+        except CommandFailedError:
+            # Not able to write. So expand the subvolume more and try writing the 5MB file again
+            nsize = osize*2
+            self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+            try:
+                self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+            except CommandFailedError:
+                self.fail("expected filling subvolume {0} with {1} file of size {2}MB"
+                                   "to succeed".format(subvolname, number_of_files, file_size))
+        else:
+            self.fail("expected filling subvolume {0} with {1} file of size {2}MB"
+                               "to fail".format(subvolname, number_of_files, file_size))
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_infinite_size(self):
+        """
+        That a subvolume can be resized to an infinite size by unsetting its quota.
+        """
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size",
+                     str(self.DEFAULT_FILE_SIZE*1024*1024))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # resize inf
+        self._fs_cmd("subvolume", "resize", self.volname, subvolname, "inf")
+
+        # verify that the quota is None
+        size = self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")
+        self.assertEqual(size, None)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_resize_infinite_size_future_writes(self):
+        """
+        That a subvolume can be resized to an infinite size and the future writes succeed.
+        """
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size",
+                     str(self.DEFAULT_FILE_SIZE*1024*1024*5), "--mode=777")
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # resize inf
+        self._fs_cmd("subvolume", "resize", self.volname, subvolname, "inf")
+
+        # verify that the quota is None
+        size = self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")
+        self.assertEqual(size, None)
+
+        # create one file of 10MB and try to write
+        file_size=self.DEFAULT_FILE_SIZE*10
+        number_of_files=1
+        log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname,
+                                                                             number_of_files,
+                                                                             file_size))
+        filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+5)
+
+        try:
+            self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size)
+        except CommandFailedError:
+            self.fail("expected filling subvolume {0} with {1} file of size {2}MB "
+                               "to succeed".format(subvolname, number_of_files, file_size))
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_rm_force(self):
+        # test removing non-existing subvolume with --force
+        subvolume = self._generate_random_subvolume_name()
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--force")
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume rm --force' command to succeed")
+
+    def test_subvolume_exists_with_subvolumegroup_and_subvolume(self):
+        """Test the presence of any subvolume by specifying the name of subvolumegroup"""
+
+        group = self._generate_random_group_name()
+        subvolume1 = self._generate_random_subvolume_name()
+        # create subvolumegroup
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group)
+        ret = self._fs_cmd("subvolume", "exist", self.volname, "--group_name", group)
+        self.assertEqual(ret.strip('\n'), "subvolume exists")
+        # delete subvolume in group
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group)
+        ret = self._fs_cmd("subvolume", "exist", self.volname, "--group_name", group)
+        self.assertEqual(ret.strip('\n'), "no subvolume exists")
+        # delete subvolumegroup
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_exists_with_subvolumegroup_and_no_subvolume(self):
+        """Test the presence of any subvolume specifying the name
+            of subvolumegroup and no subvolumes"""
+
+        group = self._generate_random_group_name()
+        # create subvolumegroup
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        ret = self._fs_cmd("subvolume", "exist", self.volname, "--group_name", group)
+        self.assertEqual(ret.strip('\n'), "no subvolume exists")
+        # delete subvolumegroup
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_exists_without_subvolumegroup_and_with_subvolume(self):
+        """Test the presence of any subvolume without specifying the name
+            of subvolumegroup"""
+
+        subvolume1 = self._generate_random_subvolume_name()
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1)
+        ret = self._fs_cmd("subvolume", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "subvolume exists")
+        # delete subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        ret = self._fs_cmd("subvolume", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "no subvolume exists")
+
+    def test_subvolume_exists_without_subvolumegroup_and_without_subvolume(self):
+        """Test the presence of any subvolume without any subvolumegroup
+            and without any subvolume"""
+
+        ret = self._fs_cmd("subvolume", "exist", self.volname)
+        self.assertEqual(ret.strip('\n'), "no subvolume exists")
+
+    def test_subvolume_shrink(self):
+        """
+        That a subvolume can be shrinked in size and its quota matches the expected size.
+        """
+
+        # create subvolume
+        subvolname = self._generate_random_subvolume_name()
+        osize = self.DEFAULT_FILE_SIZE*1024*1024
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize))
+
+        # make sure it exists
+        subvolpath = self._get_subvolume_path(self.volname, subvolname)
+        self.assertNotEqual(subvolpath, None)
+
+        # shrink the subvolume
+        nsize = osize // 2
+        self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize))
+
+        # verify the quota
+        size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes"))
+        self.assertEqual(size, nsize)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_rm_idempotency(self):
+        """
+        ensure subvolume deletion of a subvolume which is already deleted with retain snapshots option passes.
+        After subvolume deletion with retain snapshots, the subvolume exists until the trash directory (resides inside subvolume)
+        is cleaned up. The subvolume deletion issued while the trash directory is not empty, should pass and should
+        not error out with EAGAIN.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=256)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # remove snapshots (removes retained volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume (check idempotency)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                self.fail(f"expected subvolume rm to pass with error: {os.strerror(ce.exitstatus)}")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+
+    def test_subvolume_user_metadata_set(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        try:
+            self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata set' command to succeed")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_set_idempotence(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        try:
+            self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata set' command to succeed")
+
+        # set same metadata again for subvolume.
+        try:
+            self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata set' command to succeed because it is idempotent operation")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_get(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+
+        # get value for specified key.
+        try:
+            ret = self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata get' command to succeed")
+
+        # remove '\n' from returned value.
+        ret = ret.strip('\n')
+
+        # match received value with expected value.
+        self.assertEqual(value, ret)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_get_for_nonexisting_key(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+
+        # try to get value for nonexisting key
+        # Expecting ENOENT exit status because key does not exist
+        try:
+            self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, "key_nonexist", "--group_name", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because 'key_nonexist' does not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_get_for_nonexisting_section(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # try to get value for nonexisting key (as section does not exist)
+        # Expecting ENOENT exit status because key does not exist
+        try:
+            self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, "key", "--group_name", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because section does not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_update(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+
+        # update metadata against key.
+        new_value = "new_value"
+        self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, new_value, "--group_name", group)
+
+        # get metadata for specified key of subvolume.
+        try:
+            ret = self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata get' command to succeed")
+
+        # remove '\n' from returned value.
+        ret = ret.strip('\n')
+
+        # match received value with expected value.
+        self.assertEqual(new_value, ret)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_list(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        input_metadata_dict =  {f'key_{i}' : f'value_{i}' for i in range(3)}
+
+        for k, v in input_metadata_dict.items():
+            self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, k, v, "--group_name", group)
+
+        # list metadata
+        try:
+            ret = self._fs_cmd("subvolume", "metadata", "ls", self.volname, subvolname, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata ls' command to succeed")
+
+        ret_dict = json.loads(ret)
+
+        # compare output with expected output
+        self.assertDictEqual(input_metadata_dict, ret_dict)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_list_if_no_metadata_set(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # list metadata
+        try:
+            ret = self._fs_cmd("subvolume", "metadata", "ls", self.volname, subvolname, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata ls' command to succeed")
+
+        # remove '\n' from returned value.
+        ret = ret.strip('\n')
+
+        # compare output with expected output
+        # expecting empty json/dictionary
+        self.assertEqual(ret, "{}")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_remove(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+
+        # remove metadata against specified key.
+        try:
+            self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, key, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata rm' command to succeed")
+
+        # confirm key is removed by again fetching metadata
+        try:
+            self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because key does not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_remove_for_nonexisting_key(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+
+        # try to remove value for nonexisting key
+        # Expecting ENOENT exit status because key does not exist
+        try:
+            self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, "key_nonexist", "--group_name", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because 'key_nonexist' does not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_remove_for_nonexisting_section(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # try to remove value for nonexisting key (as section does not exist)
+        # Expecting ENOENT exit status because key does not exist
+        try:
+            self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, "key", "--group_name", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because section does not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_remove_force(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+
+        # remove metadata against specified key with --force option.
+        try:
+            self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, key, "--group_name", group, "--force")
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata rm' command to succeed")
+
+        # confirm key is removed by again fetching metadata
+        try:
+            self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because key does not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_remove_force_for_nonexisting_key(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+
+        # remove metadata against specified key.
+        try:
+            self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, key, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata rm' command to succeed")
+
+        # confirm key is removed by again fetching metadata
+        try:
+            self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because key does not exist")
+
+        # again remove metadata against already removed key with --force option.
+        try:
+            self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, key, "--group_name", group, "--force")
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata rm' (with --force) command to succeed")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_set_and_get_for_legacy_subvolume(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # emulate a old-fashioned subvolume in a custom group
+        createpath = os.path.join(".", "volumes", group, subvolname)
+        self.mount_a.run_shell(['mkdir', '-p', createpath], sudo=True)
+
+        # set metadata for subvolume.
+        key = "key"
+        value = "value"
+        try:
+            self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata set' command to succeed")
+
+        # get value for specified key.
+        try:
+            ret = self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata get' command to succeed")
+
+        # remove '\n' from returned value.
+        ret = ret.strip('\n')
+
+        # match received value with expected value.
+        self.assertEqual(value, ret)
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_user_metadata_list_and_remove_for_legacy_subvolume(self):
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # emulate a old-fashioned subvolume in a custom group
+        createpath = os.path.join(".", "volumes", group, subvolname)
+        self.mount_a.run_shell(['mkdir', '-p', createpath], sudo=True)
+
+        # set metadata for subvolume.
+        input_metadata_dict =  {f'key_{i}' : f'value_{i}' for i in range(3)}
+
+        for k, v in input_metadata_dict.items():
+            self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, k, v, "--group_name", group)
+
+        # list metadata
+        try:
+            ret = self._fs_cmd("subvolume", "metadata", "ls", self.volname, subvolname, "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata ls' command to succeed")
+
+        ret_dict = json.loads(ret)
+
+        # compare output with expected output
+        self.assertDictEqual(input_metadata_dict, ret_dict)
+
+        # remove metadata against specified key.
+        try:
+            self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, "key_1", "--group_name", group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume metadata rm' command to succeed")
+
+        # confirm key is removed by again fetching metadata
+        try:
+            self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, "key_1", "--group_name", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because key_1 does not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+class TestSubvolumeGroupSnapshots(TestVolumesHelper):
+    """Tests for FS subvolume group snapshot operations."""
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_nonexistent_subvolume_group_snapshot_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # snapshot group
+        self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot)
+
+        # remove snapshot
+        try:
+            self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolumegroup snapshot rm' command to fail")
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_subvolume_group_snapshot_create_and_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # snapshot group
+        self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_subvolume_group_snapshot_idempotence(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # snapshot group
+        self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        # try creating snapshot w/ same snapshot name -- shoule be idempotent
+        self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_subvolume_group_snapshot_ls(self):
+        # tests the 'fs subvolumegroup snapshot ls' command
+
+        snapshots = []
+
+        # create group
+        group = self._generate_random_group_name()
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolumegroup snapshots
+        snapshots = self._generate_random_snapshot_name(3)
+        for snapshot in snapshots:
+            self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+
+        subvolgrpsnapshotls = json.loads(self._fs_cmd('subvolumegroup', 'snapshot', 'ls', self.volname, group))
+        if len(subvolgrpsnapshotls) == 0:
+            raise RuntimeError("Expected the 'fs subvolumegroup snapshot ls' command to list the created subvolume group snapshots")
+        else:
+            snapshotnames = [snapshot['name'] for snapshot in subvolgrpsnapshotls]
+            if collections.Counter(snapshotnames) != collections.Counter(snapshots):
+                raise RuntimeError("Error creating or listing subvolume group snapshots")
+
+    @unittest.skip("skipping subvolumegroup snapshot tests")
+    def test_subvolume_group_snapshot_rm_force(self):
+        # test removing non-existing subvolume group snapshot with --force
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+        # remove snapshot
+        try:
+            self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot, "--force")
+        except CommandFailedError:
+            raise RuntimeError("expected the 'fs subvolumegroup snapshot rm --force' command to succeed")
+
+    def test_subvolume_group_snapshot_unsupported_status(self):
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # snapshot group
+        try:
+            self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOSYS, "invalid error code on subvolumegroup snapshot create")
+        else:
+            self.fail("expected subvolumegroup snapshot create command to fail")
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+
+class TestSubvolumeSnapshots(TestVolumesHelper):
+    """Tests for FS subvolume snapshot operations."""
+    def test_nonexistent_subvolume_snapshot_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove snapshot again
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOENT:
+                raise
+        else:
+            raise RuntimeError("expected the 'fs subvolume snapshot rm' command to fail")
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_create_and_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_create_idempotence(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # try creating w/ same subvolume snapshot name -- should be idempotent
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_info(self):
+
+        """
+        tests the 'fs subvolume snapshot info' command
+        """
+
+        snap_md = ["created_at", "data_pool", "has_pending_clones"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot, snap_missing = self._generate_random_snapshot_name(2)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=1)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot))
+        for md in snap_md:
+            self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md))
+        self.assertEqual(snap_info["has_pending_clones"], "no")
+
+        # snapshot info for non-existent snapshot
+        try:
+            self._get_subvolume_snapshot_info(self.volname, subvolume, snap_missing)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on snapshot info of non-existent snapshot")
+        else:
+            self.fail("expected snapshot info of non-existent snapshot to fail")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_in_group(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # snapshot subvolume in group
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_snapshot_ls(self):
+        # tests the 'fs subvolume snapshot ls' command
+
+        snapshots = []
+
+        # create subvolume
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # create subvolume snapshots
+        snapshots = self._generate_random_snapshot_name(3)
+        for snapshot in snapshots:
+            self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume))
+        if len(subvolsnapshotls) == 0:
+            self.fail("Expected the 'fs subvolume snapshot ls' command to list the created subvolume snapshots")
+        else:
+            snapshotnames = [snapshot['name'] for snapshot in subvolsnapshotls]
+            if collections.Counter(snapshotnames) != collections.Counter(snapshots):
+                self.fail("Error creating or listing subvolume snapshots")
+
+        # remove snapshot
+        for snapshot in snapshots:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_inherited_snapshot_ls(self):
+        # tests the scenario where 'fs subvolume snapshot ls' command
+        # should not list inherited snapshots created as part of snapshot
+        # at ancestral level
+
+        snapshots = []
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snap_count = 3
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # create subvolume snapshots
+        snapshots = self._generate_random_snapshot_name(snap_count)
+        for snapshot in snapshots:
+            self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group)
+
+        # Create snapshot at ancestral level
+        ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", "ancestral_snap_1")
+        ancestral_snappath2 = os.path.join(".", "volumes", group, ".snap", "ancestral_snap_2")
+        self.mount_a.run_shell(['mkdir', '-p', ancestral_snappath1, ancestral_snappath2], sudo=True)
+
+        subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume, group))
+        self.assertEqual(len(subvolsnapshotls), snap_count)
+
+        # remove ancestral snapshots
+        self.mount_a.run_shell(['rmdir', ancestral_snappath1, ancestral_snappath2], sudo=True)
+
+        # remove snapshot
+        for snapshot in snapshots:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_inherited_snapshot_info(self):
+        """
+        tests the scenario where 'fs subvolume snapshot info' command
+        should fail for inherited snapshots created as part of snapshot
+        at ancestral level
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Create snapshot at ancestral level
+        ancestral_snap_name = "ancestral_snap_1"
+        ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", ancestral_snap_name)
+        self.mount_a.run_shell(['mkdir', '-p', ancestral_snappath1], sudo=True)
+
+        # Validate existence of inherited snapshot
+        group_path = os.path.join(".", "volumes", group)
+        inode_number_group_dir = int(self.mount_a.run_shell(['stat', '-c' '%i', group_path]).stdout.getvalue().strip())
+        inherited_snap = "_{0}_{1}".format(ancestral_snap_name, inode_number_group_dir)
+        inherited_snappath = os.path.join(".", "volumes", group, subvolume,".snap", inherited_snap)
+        self.mount_a.run_shell(['ls', inherited_snappath])
+
+        # snapshot info on inherited snapshot
+        try:
+            self._get_subvolume_snapshot_info(self.volname, subvolume, inherited_snap, group)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on snapshot info of inherited snapshot")
+        else:
+            self.fail("expected snapshot info of inherited snapshot to fail")
+
+        # remove ancestral snapshots
+        self.mount_a.run_shell(['rmdir', ancestral_snappath1], sudo=True)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_inherited_snapshot_rm(self):
+        """
+        tests the scenario where 'fs subvolume snapshot rm' command
+        should fail for inherited snapshots created as part of snapshot
+        at ancestral level
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Create snapshot at ancestral level
+        ancestral_snap_name = "ancestral_snap_1"
+        ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", ancestral_snap_name)
+        self.mount_a.run_shell(['mkdir', '-p', ancestral_snappath1], sudo=True)
+
+        # Validate existence of inherited snap
+        group_path = os.path.join(".", "volumes", group)
+        inode_number_group_dir = int(self.mount_a.run_shell(['stat', '-c' '%i', group_path]).stdout.getvalue().strip())
+        inherited_snap = "_{0}_{1}".format(ancestral_snap_name, inode_number_group_dir)
+        inherited_snappath = os.path.join(".", "volumes", group, subvolume,".snap", inherited_snap)
+        self.mount_a.run_shell(['ls', inherited_snappath])
+
+        # inherited snapshot should not be deletable
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, inherited_snap, "--group_name", group)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, msg="invalid error code when removing inherited snapshot")
+        else:
+            self.fail("expected removing inheirted snapshot to fail")
+
+        # remove ancestral snapshots
+        self.mount_a.run_shell(['rmdir', ancestral_snappath1], sudo=True)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_subvolumegroup_snapshot_name_conflict(self):
+        """
+        tests the scenario where creation of subvolume snapshot name
+        with same name as it's subvolumegroup snapshot name. This should
+        fail.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        group_snapshot = self._generate_random_snapshot_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group)
+
+        # Create subvolumegroup snapshot
+        group_snapshot_path = os.path.join(".", "volumes", group, ".snap", group_snapshot)
+        self.mount_a.run_shell(['mkdir', '-p', group_snapshot_path], sudo=True)
+
+        # Validate existence of subvolumegroup snapshot
+        self.mount_a.run_shell(['ls', group_snapshot_path])
+
+        # Creation of subvolume snapshot with it's subvolumegroup snapshot name should fail
+        try:
+            self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, group_snapshot, "--group_name", group)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, msg="invalid error code when creating subvolume snapshot with same name as subvolume group snapshot")
+        else:
+            self.fail("expected subvolume snapshot creation with same name as subvolumegroup snapshot to fail")
+
+        # remove subvolumegroup snapshot
+        self.mount_a.run_shell(['rmdir', group_snapshot_path], sudo=True)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_retain_snapshot_invalid_recreate(self):
+        """
+        ensure retained subvolume recreate does not leave any incarnations in the subvolume and trash
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # recreate subvolume with an invalid pool
+        data_pool = "invalid_pool"
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on recreate of subvolume with invalid poolname")
+        else:
+            self.fail("expected recreate of subvolume with invalid poolname to fail")
+
+        # fetch info
+        subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume))
+        self.assertEqual(subvol_info["state"], "snapshot-retained",
+                         msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"]))
+
+        # getpath
+        try:
+            self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of subvolume with retained snapshots")
+        else:
+            self.fail("expected getpath of subvolume with retained snapshots to fail")
+
+        # remove snapshot (should remove volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_recreate_subvolume(self):
+        """
+        ensure a retained subvolume can be recreated and further snapshotted
+        """
+        snap_md = ["created_at", "data_pool", "has_pending_clones"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot1, snapshot2 = self._generate_random_snapshot_name(2)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # fetch info
+        subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume))
+        self.assertEqual(subvol_info["state"], "snapshot-retained",
+                         msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"]))
+
+        # recreate retained subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # fetch info
+        subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume))
+        self.assertEqual(subvol_info["state"], "complete",
+                         msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"]))
+
+        # snapshot info (older snapshot)
+        snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot1))
+        for md in snap_md:
+            self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md))
+        self.assertEqual(snap_info["has_pending_clones"], "no")
+
+        # snap-create (new snapshot)
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot2)
+
+        # remove with retain snapshots
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # list snapshots
+        subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume))
+        self.assertEqual(len(subvolsnapshotls), 2, "Expected the 'fs subvolume snapshot ls' command to list the"
+                         " created subvolume snapshots")
+        snapshotnames = [snapshot['name'] for snapshot in subvolsnapshotls]
+        for snap in [snapshot1, snapshot2]:
+            self.assertIn(snap, snapshotnames, "Missing snapshot '{0}' in snapshot list".format(snap))
+
+        # remove snapshots (should remove volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot2)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_with_snapshots(self):
+        """
+        ensure retain snapshots based delete of a subvolume with snapshots retains the subvolume
+        also test allowed and dis-allowed operations on a retained subvolume
+        """
+        snap_md = ["created_at", "data_pool", "has_pending_clones"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove subvolume -- should fail with ENOTEMPTY since it has snapshots
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of retained subvolume with snapshots")
+        else:
+            self.fail("expected rm of subvolume with retained snapshots to fail")
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # fetch info
+        subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume))
+        self.assertEqual(subvol_info["state"], "snapshot-retained",
+                         msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"]))
+
+        ## test allowed ops in retained state
+        # ls
+        subvolumes = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumes), 1, "subvolume ls count mismatch, expected '1', found {0}".format(len(subvolumes)))
+        self.assertEqual(subvolumes[0]['name'], subvolume,
+                         "subvolume name mismatch in ls output, expected '{0}', found '{1}'".format(subvolume, subvolumes[0]['name']))
+
+        # snapshot info
+        snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot))
+        for md in snap_md:
+            self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md))
+        self.assertEqual(snap_info["has_pending_clones"], "no")
+
+        # rm --force (allowed but should fail)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--force")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of subvolume with retained snapshots")
+        else:
+            self.fail("expected rm of subvolume with retained snapshots to fail")
+
+        # rm (allowed but should fail)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of subvolume with retained snapshots")
+        else:
+            self.fail("expected rm of subvolume with retained snapshots to fail")
+
+        ## test disallowed ops
+        # getpath
+        try:
+            self._fs_cmd("subvolume", "getpath", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of subvolume with retained snapshots")
+        else:
+            self.fail("expected getpath of subvolume with retained snapshots to fail")
+
+        # resize
+        nsize = self.DEFAULT_FILE_SIZE*1024*1024
+        try:
+            self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize))
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on resize of subvolume with retained snapshots")
+        else:
+            self.fail("expected resize of subvolume with retained snapshots to fail")
+
+        # snap-create
+        try:
+            self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, "fail")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on snapshot create of subvolume with retained snapshots")
+        else:
+            self.fail("expected snapshot create of subvolume with retained snapshots to fail")
+
+        # remove snapshot (should remove volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_without_snapshots(self):
+        """
+        ensure retain snapshots based delete of a subvolume with no snapshots, deletes the subbvolume
+        """
+        subvolume = self._generate_random_subvolume_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # remove with snapshot retention (should remove volume, no snapshots to retain)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_trash_busy_recreate(self):
+        """
+        ensure retained subvolume recreate fails if its trash is not yet purged
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # fake a trash entry
+        self._update_fake_trash(subvolume)
+
+        # recreate subvolume
+        try:
+            self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on recreate of subvolume with purge pending")
+        else:
+            self.fail("expected recreate of subvolume with purge pending to fail")
+
+        # clear fake trash entry
+        self._update_fake_trash(subvolume, create=False)
+
+        # recreate subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_rm_with_snapshots(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove subvolume -- should fail with ENOTEMPTY since it has snapshots
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOTEMPTY:
+                raise RuntimeError("invalid error code returned when deleting subvolume with snapshots")
+        else:
+            raise RuntimeError("expected subvolume deletion to fail")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_protect_unprotect_sanity(self):
+        """
+        Snapshot protect/unprotect commands are deprecated. This test exists to ensure that
+        invoking the command does not cause errors, till they are removed from a subsequent release.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # now, protect snapshot
+        self._fs_cmd("subvolume", "snapshot", "protect", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # now, unprotect snapshot
+        self._fs_cmd("subvolume", "snapshot", "unprotect", self.volname, subvolume, snapshot)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_rm_force(self):
+        # test removing non existing subvolume snapshot with --force
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # remove snapshot
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, "--force")
+        except CommandFailedError:
+            raise RuntimeError("expected the 'fs subvolume snapshot rm --force' command to succeed")
+
+    def test_subvolume_snapshot_metadata_set(self):
+        """
+        Set custom metadata for subvolume snapshot.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata set' command to succeed")
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_set_idempotence(self):
+        """
+        Set custom metadata for subvolume snapshot (Idempotency).
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata set' command to succeed")
+
+        # set same metadata again for subvolume.
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata set' command to succeed because it is idempotent operation")
+
+        # get value for specified key.
+        try:
+            ret = self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata get' command to succeed")
+
+        # remove '\n' from returned value.
+        ret = ret.strip('\n')
+
+        # match received value with expected value.
+        self.assertEqual(value, ret)
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_get(self):
+        """
+        Get custom metadata for a specified key in subvolume snapshot metadata.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+
+        # get value for specified key.
+        try:
+            ret = self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata get' command to succeed")
+
+        # remove '\n' from returned value.
+        ret = ret.strip('\n')
+
+        # match received value with expected value.
+        self.assertEqual(value, ret)
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_get_for_nonexisting_key(self):
+        """
+        Get custom metadata for subvolume snapshot if specified key not exist in metadata.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+
+        # try to get value for nonexisting key
+        # Expecting ENOENT exit status because key does not exist
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, "key_nonexist", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because 'key_nonexist' does not exist")
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_get_for_nonexisting_section(self):
+        """
+        Get custom metadata for subvolume snapshot if metadata is not added for subvolume snapshot.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # try to get value for nonexisting key (as section does not exist)
+        # Expecting ENOENT exit status because key does not exist
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, "key", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because section does not exist")
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_update(self):
+        """
+        Update custom metadata for a specified key in subvolume snapshot metadata.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+
+        # update metadata against key.
+        new_value = "new_value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, new_value, group)
+
+        # get metadata for specified key of snapshot.
+        try:
+            ret = self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata get' command to succeed")
+
+        # remove '\n' from returned value.
+        ret = ret.strip('\n')
+
+        # match received value with expected value.
+        self.assertEqual(new_value, ret)
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_list(self):
+        """
+        List custom metadata for subvolume snapshot.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for subvolume.
+        input_metadata_dict =  {f'key_{i}' : f'value_{i}' for i in range(3)}
+
+        for k, v in input_metadata_dict.items():
+            self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, k, v, group)
+
+        # list metadata
+        try:
+            ret_dict = json.loads(self._fs_cmd("subvolume", "snapshot", "metadata", "ls", self.volname, subvolname, snapshot, group))
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata ls' command to succeed")
+
+        # compare output with expected output
+        self.assertDictEqual(input_metadata_dict, ret_dict)
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_list_if_no_metadata_set(self):
+        """
+        List custom metadata for subvolume snapshot if metadata is not added for subvolume snapshot.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # list metadata
+        try:
+            ret_dict = json.loads(self._fs_cmd("subvolume", "snapshot", "metadata", "ls", self.volname, subvolname, snapshot, group))
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata ls' command to succeed")
+
+        # compare output with expected output
+        empty_dict = {}
+        self.assertDictEqual(ret_dict, empty_dict)
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_remove(self):
+        """
+        Remove custom metadata for a specified key in subvolume snapshot metadata.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+
+        # remove metadata against specified key.
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, key, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata rm' command to succeed")
+
+        # confirm key is removed by again fetching metadata
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, key, snapshot, group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because key does not exist")
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_remove_for_nonexisting_key(self):
+        """
+        Remove custom metadata for subvolume snapshot if specified key not exist in metadata.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+
+        # try to remove value for nonexisting key
+        # Expecting ENOENT exit status because key does not exist
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, "key_nonexist", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because 'key_nonexist' does not exist")
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_remove_for_nonexisting_section(self):
+        """
+        Remove custom metadata for subvolume snapshot if metadata is not added for subvolume snapshot.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # try to remove value for nonexisting key (as section does not exist)
+        # Expecting ENOENT exit status because key does not exist
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, "key", group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because section does not exist")
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_remove_force(self):
+        """
+        Forcefully remove custom metadata for a specified key in subvolume snapshot metadata.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+
+        # remove metadata against specified key with --force option.
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, key, group, "--force")
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata rm' command to succeed")
+
+        # confirm key is removed by again fetching metadata
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because key does not exist")
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_remove_force_for_nonexisting_key(self):
+        """
+        Forcefully remove custom metadata for subvolume snapshot if specified key not exist in metadata.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+
+        # remove metadata against specified key.
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, key, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata rm' command to succeed")
+
+        # confirm key is removed by again fetching metadata
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            self.fail("Expected ENOENT because key does not exist")
+
+        # again remove metadata against already removed key with --force option.
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, key, group, "--force")
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata rm' (with --force) command to succeed")
+
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_metadata_after_snapshot_remove(self):
+        """
+        Verify metadata removal of subvolume snapshot after snapshot removal.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+
+        # get value for specified key.
+        ret = self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group)
+
+        # remove '\n' from returned value.
+        ret = ret.strip('\n')
+
+        # match received value with expected value.
+        self.assertEqual(value, ret)
+
+        # remove subvolume snapshot.
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+
+        # try to get metadata after removing snapshot.
+        # Expecting error ENOENT with error message of snapshot does not exist
+        cmd_ret = self.mgr_cluster.mon_manager.run_cluster_cmd(
+                args=["fs", "subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group],
+                check_status=False, stdout=StringIO(), stderr=StringIO())
+        self.assertEqual(cmd_ret.returncode, errno.ENOENT, "Expecting ENOENT error")
+        self.assertIn(f"snapshot '{snapshot}' does not exist", cmd_ret.stderr.getvalue(),
+                f"Expecting message: snapshot '{snapshot}' does not exist ")
+
+        # confirm metadata is removed by searching section name in .meta file
+        meta_path = os.path.join(".", "volumes", group, subvolname, ".meta")
+        section_name = "SNAP_METADATA_" + snapshot
+
+        try:
+            self.mount_a.run_shell(f"sudo grep {section_name} {meta_path}", omit_sudo=False)
+        except CommandFailedError as e:
+            self.assertNotEqual(e.exitstatus, 0)
+        else:
+            self.fail("Expected non-zero exist status because section should not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+
+    def test_clean_stale_subvolume_snapshot_metadata(self):
+        """
+        Validate cleaning of stale subvolume snapshot metadata.
+        """
+        subvolname = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create group.
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume in group.
+        self._fs_cmd("subvolume", "create", self.volname, subvolname, group)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group)
+
+        # set metadata for snapshot.
+        key = "key"
+        value = "value"
+        try:
+            self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group)
+        except CommandFailedError:
+            self.fail("expected the 'fs subvolume snapshot metadata set' command to succeed")
+
+        # save the subvolume config file.
+        meta_path = os.path.join(".", "volumes", group, subvolname, ".meta")
+        tmp_meta_path = os.path.join(".", "volumes", group, subvolname, ".meta.stale_snap_section")
+        self.mount_a.run_shell(['sudo', 'cp', '-p', meta_path, tmp_meta_path], omit_sudo=False)
+
+        # Delete snapshot, this would remove user snap metadata
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group)
+
+        # Copy back saved subvolume config file. This would have stale snapshot metadata
+        self.mount_a.run_shell(['sudo', 'cp', '-p', tmp_meta_path, meta_path], omit_sudo=False)
+
+        # Verify that it has stale snapshot metadata
+        section_name = "SNAP_METADATA_" + snapshot
+        try:
+            self.mount_a.run_shell(f"sudo grep {section_name} {meta_path}", omit_sudo=False)
+        except CommandFailedError:
+            self.fail("Expected grep cmd to succeed because stale snapshot metadata exist")
+
+        # Do any subvolume operation to clean the stale snapshot metadata
+        _ = json.loads(self._get_subvolume_info(self.volname, subvolname, group))
+
+        # Verify that the stale snapshot metadata is cleaned
+        try:
+            self.mount_a.run_shell(f"sudo grep {section_name} {meta_path}", omit_sudo=False)
+        except CommandFailedError as e:
+            self.assertNotEqual(e.exitstatus, 0)
+        else:
+            self.fail("Expected non-zero exist status because stale snapshot metadata should not exist")
+
+        self._fs_cmd("subvolume", "rm", self.volname, subvolname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean.
+        self._wait_for_trash_empty()
+        # Clean tmp config file
+        self.mount_a.run_shell(['sudo', 'rm', '-f', tmp_meta_path], omit_sudo=False)
+
+
+class TestSubvolumeSnapshotClones(TestVolumesHelper):
+    """ Tests for FS subvolume snapshot clone operations."""
+    def test_clone_subvolume_info(self):
+        # tests the 'fs subvolume info' command for a clone
+        subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime",
+                     "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace",
+                     "type", "uid"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=1)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, clone))
+        if len(subvol_info) == 0:
+            raise RuntimeError("Expected the 'fs subvolume info' command to list metadata of subvolume")
+        for md in subvol_md:
+            if md not in subvol_info.keys():
+                raise RuntimeError("%s not present in the metadata of subvolume" % md)
+        if subvol_info["type"] != "clone":
+            raise RuntimeError("type should be set to clone")
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_info_without_snapshot_clone(self):
+        """
+        Verify subvolume snapshot info output without clonig snapshot.
+        If no clone is performed then path /volumes/_index/clone/{track_id}
+        will not exist.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume.
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # list snapshot info
+        result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot))
+
+        # verify snapshot info
+        self.assertEqual(result['has_pending_clones'], "no")
+        self.assertFalse('orphan_clones_count' in result)
+        self.assertFalse('pending_clones' in result)
+
+        # remove snapshot, subvolume, clone
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_info_if_no_clone_pending(self):
+        """
+        Verify subvolume snapshot info output if no clone is in pending state.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone_list =  [f'clone_{i}' for i in range(3)]
+
+        # create subvolume.
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clones
+        for clone in clone_list:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clones status
+        for clone in clone_list:
+            self._wait_for_clone_to_complete(clone)
+
+        # list snapshot info
+        result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot))
+
+        # verify snapshot info
+        self.assertEqual(result['has_pending_clones'], "no")
+        self.assertFalse('orphan_clones_count' in result)
+        self.assertFalse('pending_clones' in result)
+
+        # remove snapshot, subvolume, clone
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        for clone in clone_list:
+            self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_info_if_clone_pending_for_no_group(self):
+        """
+        Verify subvolume snapshot info output if clones are in pending state.
+        Clones are not specified for particular target_group. Hence target_group
+        should not be in the output as we don't show _nogroup (default group)
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone_list =  [f'clone_{i}' for i in range(3)]
+
+        # create subvolume.
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5)
+
+        # schedule a clones
+        for clone in clone_list:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # list snapshot info
+        result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot))
+
+        # verify snapshot info
+        expected_clone_list = []
+        for clone in clone_list:
+            expected_clone_list.append({"name": clone})
+        self.assertEqual(result['has_pending_clones'], "yes")
+        self.assertFalse('orphan_clones_count' in result)
+        self.assertListEqual(result['pending_clones'], expected_clone_list)
+        self.assertEqual(len(result['pending_clones']), 3)
+
+        # check clones status
+        for clone in clone_list:
+            self._wait_for_clone_to_complete(clone)
+
+        # remove snapshot, subvolume, clone
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        for clone in clone_list:
+            self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_info_if_clone_pending_for_target_group(self):
+        """
+        Verify subvolume snapshot info output if clones are in pending state.
+        Clones are not specified for target_group.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+        group = self._generate_random_group_name()
+        target_group = self._generate_random_group_name()
+
+        # create groups
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+        self._fs_cmd("subvolumegroup", "create", self.volname, target_group)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, group, "--mode=777")
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group)
+
+        # insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone,
+                     "--group_name", group, "--target_group_name", target_group)
+
+        # list snapshot info
+        result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot, "--group_name", group))
+
+        # verify snapshot info
+        expected_clone_list = [{"name": clone, "target_group": target_group}]
+        self.assertEqual(result['has_pending_clones'], "yes")
+        self.assertFalse('orphan_clones_count' in result)
+        self.assertListEqual(result['pending_clones'], expected_clone_list)
+        self.assertEqual(len(result['pending_clones']), 1)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone, clone_group=target_group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+        self._fs_cmd("subvolume", "rm", self.volname, clone, target_group)
+
+        # remove groups
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, target_group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_info_if_orphan_clone(self):
+        """
+        Verify subvolume snapshot info output if orphan clones exists.
+        Orphan clones should not list under pending clones.
+        orphan_clones_count should display correct count of orphan clones'
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone_list =  [f'clone_{i}' for i in range(3)]
+
+        # create subvolume.
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 15)
+
+        # schedule a clones
+        for clone in clone_list:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # remove track file for third clone to make it orphan
+        meta_path = os.path.join(".", "volumes", "_nogroup", subvolume, ".meta")
+        pending_clones_result = self.mount_a.run_shell(['sudo', 'grep', 'clone snaps', '-A3', meta_path], omit_sudo=False, stdout=StringIO(), stderr=StringIO())
+        third_clone_track_id = pending_clones_result.stdout.getvalue().splitlines()[3].split(" = ")[0]
+        third_clone_track_path = os.path.join(".", "volumes", "_index", "clone", third_clone_track_id)
+        self.mount_a.run_shell(f"sudo rm -f {third_clone_track_path}", omit_sudo=False)
+
+        # list snapshot info
+        result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot))
+
+        # verify snapshot info
+        expected_clone_list = []
+        for i in range(len(clone_list)-1):
+            expected_clone_list.append({"name": clone_list[i]})
+        self.assertEqual(result['has_pending_clones'], "yes")
+        self.assertEqual(result['orphan_clones_count'], 1)
+        self.assertListEqual(result['pending_clones'], expected_clone_list)
+        self.assertEqual(len(result['pending_clones']), 2)
+
+        # check clones status
+        for i in range(len(clone_list)-1):
+            self._wait_for_clone_to_complete(clone_list[i])
+
+        # list snapshot info after cloning completion
+        res = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot))
+
+        # verify snapshot info (has_pending_clones should be no)
+        self.assertEqual(res['has_pending_clones'], "no")
+
+    def test_non_clone_status(self):
+        subvolume = self._generate_random_subvolume_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        try:
+            self._fs_cmd("clone", "status", self.volname, subvolume)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.ENOTSUP:
+                raise RuntimeError("invalid error code when fetching status of a non cloned subvolume")
+        else:
+            raise RuntimeError("expected fetching of clone status of a subvolume to fail")
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_inherit_snapshot_namespace_and_size(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*12
+
+        # create subvolume, in an isolated namespace with a specified size
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--namespace-isolated", "--size", str(osize), "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=8)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # create a pool different from current subvolume pool
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+        default_pool = self.mount_a.getfattr(subvol_path, "ceph.dir.layout.pool")
+        new_pool = "new_pool"
+        self.assertNotEqual(default_pool, new_pool)
+        self.fs.add_data_pool(new_pool)
+
+        # update source subvolume pool
+        self._do_subvolume_pool_and_namespace_update(subvolume, pool=new_pool, pool_namespace="")
+
+        # schedule a clone, with NO --pool specification
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_inherit_quota_attrs(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*12
+
+        # create subvolume with a specified size
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777", "--size", str(osize))
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=8)
+
+        # get subvolume path
+        subvolpath = self._get_subvolume_path(self.volname, subvolume)
+
+        # set quota on number of files
+        self.mount_a.setfattr(subvolpath, 'ceph.quota.max_files', "20", sudo=True)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # get subvolume path
+        clonepath = self._get_subvolume_path(self.volname, clone)
+
+        # verify quota max_files is inherited from source snapshot
+        subvol_quota = self.mount_a.getfattr(subvolpath, "ceph.quota.max_files")
+        clone_quota = self.mount_a.getfattr(clonepath, "ceph.quota.max_files")
+        self.assertEqual(subvol_quota, clone_quota)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_in_progress_getpath(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # clone should not be accessible right now
+        try:
+            self._get_subvolume_path(self.volname, clone)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EAGAIN:
+                raise RuntimeError("invalid error code when fetching path of an pending clone")
+        else:
+            raise RuntimeError("expected fetching path of an pending clone to fail")
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # clone should be accessible now
+        subvolpath = self._get_subvolume_path(self.volname, clone)
+        self.assertNotEqual(subvolpath, None)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_in_progress_snapshot_rm(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # snapshot should not be deletable now
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, msg="invalid error code when removing source snapshot of a clone")
+        else:
+            self.fail("expected removing source snapshot of a clone to fail")
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # clone should be accessible now
+        subvolpath = self._get_subvolume_path(self.volname, clone)
+        self.assertNotEqual(subvolpath, None)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_in_progress_source(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # verify clone source
+        result = json.loads(self._fs_cmd("clone", "status", self.volname, clone))
+        source = result['status']['source']
+        self.assertEqual(source['volume'], self.volname)
+        self.assertEqual(source['subvolume'], subvolume)
+        self.assertEqual(source.get('group', None), None)
+        self.assertEqual(source['snapshot'], snapshot)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # clone should be accessible now
+        subvolpath = self._get_subvolume_path(self.volname, clone)
+        self.assertNotEqual(subvolpath, None)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_clone_retain_snapshot_with_snapshots(self):
+        """
+        retain snapshots of a cloned subvolume and check disallowed operations
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot1, snapshot2 = self._generate_random_snapshot_name(2)
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # store path for clone verification
+        subvol1_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # clone retained subvolume snapshot
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot1, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot1, clone, subvol_path=subvol1_path)
+
+        # create a snapshot on the clone
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone, snapshot2)
+
+        # retain a clone
+        self._fs_cmd("subvolume", "rm", self.volname, clone, "--retain-snapshots")
+
+        # list snapshots
+        clonesnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, clone))
+        self.assertEqual(len(clonesnapshotls), 1, "Expected the 'fs subvolume snapshot ls' command to list the"
+                         " created subvolume snapshots")
+        snapshotnames = [snapshot['name'] for snapshot in clonesnapshotls]
+        for snap in [snapshot2]:
+            self.assertIn(snap, snapshotnames, "Missing snapshot '{0}' in snapshot list".format(snap))
+
+        ## check disallowed operations on retained clone
+        # clone-status
+        try:
+            self._fs_cmd("clone", "status", self.volname, clone)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on clone status of clone with retained snapshots")
+        else:
+            self.fail("expected clone status of clone with retained snapshots to fail")
+
+        # clone-cancel
+        try:
+            self._fs_cmd("clone", "cancel", self.volname, clone)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on clone cancel of clone with retained snapshots")
+        else:
+            self.fail("expected clone cancel of clone with retained snapshots to fail")
+
+        # remove snapshots (removes subvolumes as all are in retained state)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone, snapshot2)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_clone(self):
+        """
+        clone a snapshot from a snapshot retained subvolume
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # store path for clone verification
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # clone retained subvolume snapshot
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, subvol_path=subvol_path)
+
+        # remove snapshots (removes retained volume)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_clone_from_newer_snapshot(self):
+        """
+        clone a subvolume from recreated subvolume's latest snapshot
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot1, snapshot2 = self._generate_random_snapshot_name(2)
+        clone = self._generate_random_clone_name(1)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # recreate subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # get and store path for clone verification
+        subvol2_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot newer subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot2)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # clone retained subvolume's newer snapshot
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot2, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot2, clone, subvol_path=subvol2_path)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot2)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_recreate(self):
+        """
+        recreate a subvolume from one of its retained snapshots
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # store path for clone verification
+        subvol_path = self._get_subvolume_path(self.volname, subvolume)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=16)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # remove with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+        # recreate retained subvolume using its own snapshot to clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, subvolume)
+
+        # check clone status
+        self._wait_for_clone_to_complete(subvolume)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, subvolume, subvol_path=subvol_path)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_retain_snapshot_trash_busy_recreate_clone(self):
+        """
+        ensure retained clone recreate fails if its trash is not yet purged
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # clone subvolume snapshot
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # snapshot clone
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone, snapshot)
+
+        # remove clone with snapshot retention
+        self._fs_cmd("subvolume", "rm", self.volname, clone, "--retain-snapshots")
+
+        # fake a trash entry
+        self._update_fake_trash(clone)
+
+        # clone subvolume snapshot (recreate)
+        try:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on recreate of clone with purge pending")
+        else:
+            self.fail("expected recreate of clone with purge pending to fail")
+
+        # clear fake trash entry
+        self._update_fake_trash(clone, create=False)
+
+        # recreate subvolume
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone, snapshot)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_attr_clone(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io_mixed(subvolume)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_clone_failure_status_pending_in_progress_complete(self):
+        """
+        ensure failure status is not shown when clone is not in failed/cancelled state
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1 = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=200)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5)
+
+        # schedule a clone1
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1)
+
+        # pending clone shouldn't show failure status
+        clone1_result = self._get_clone_status(clone1)
+        try:
+            clone1_result["status"]["failure"]["errno"]
+        except KeyError as e:
+            self.assertEqual(str(e), "'failure'")
+        else:
+            self.fail("clone status shouldn't show failure for pending clone")
+
+        # check clone1 to be in-progress
+        self._wait_for_clone_to_be_in_progress(clone1)
+
+        # in-progress clone1 shouldn't show failure status
+        clone1_result = self._get_clone_status(clone1)
+        try:
+            clone1_result["status"]["failure"]["errno"]
+        except KeyError as e:
+            self.assertEqual(str(e), "'failure'")
+        else:
+            self.fail("clone status shouldn't show failure for in-progress clone")
+
+        # wait for clone1 to complete
+        self._wait_for_clone_to_complete(clone1)
+
+        # complete clone1 shouldn't show failure status
+        clone1_result = self._get_clone_status(clone1)
+        try:
+            clone1_result["status"]["failure"]["errno"]
+        except KeyError as e:
+            self.assertEqual(str(e), "'failure'")
+        else:
+            self.fail("clone status shouldn't show failure for complete clone")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone1)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_clone_failure_status_failed(self):
+        """
+        ensure failure status is shown when clone is in failed state and validate the reason
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1 = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=200)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5)
+
+        # schedule a clone1
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1)
+
+        # remove snapshot from backend to force the clone failure.
+        snappath = os.path.join(".", "volumes", "_nogroup", subvolume, ".snap", snapshot)
+        self.mount_a.run_shell(['rmdir', snappath], sudo=True)
+
+        # wait for clone1 to fail.
+        self._wait_for_clone_to_fail(clone1)
+
+        # check clone1 status
+        clone1_result = self._get_clone_status(clone1)
+        self.assertEqual(clone1_result["status"]["state"], "failed")
+        self.assertEqual(clone1_result["status"]["failure"]["errno"], "2")
+        self.assertEqual(clone1_result["status"]["failure"]["error_msg"], "snapshot '{0}' does not exist".format(snapshot))
+
+        # clone removal should succeed after failure, remove clone1
+        self._fs_cmd("subvolume", "rm", self.volname, clone1, "--force")
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_clone_failure_status_pending_cancelled(self):
+        """
+        ensure failure status is shown when clone is cancelled during pending state and validate the reason
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1 = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=200)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5)
+
+        # schedule a clone1
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1)
+
+        # cancel pending clone1
+        self._fs_cmd("clone", "cancel", self.volname, clone1)
+
+        # check clone1 status
+        clone1_result = self._get_clone_status(clone1)
+        self.assertEqual(clone1_result["status"]["state"], "canceled")
+        self.assertEqual(clone1_result["status"]["failure"]["errno"], "4")
+        self.assertEqual(clone1_result["status"]["failure"]["error_msg"], "user interrupted clone operation")
+
+        # clone removal should succeed with force after cancelled, remove clone1
+        self._fs_cmd("subvolume", "rm", self.volname, clone1, "--force")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_clone_failure_status_in_progress_cancelled(self):
+        """
+        ensure failure status is shown when clone is cancelled during in-progress state and validate the reason
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1 = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=200)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5)
+
+        # schedule a clone1
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1)
+
+        # wait for clone1 to be in-progress
+        self._wait_for_clone_to_be_in_progress(clone1)
+
+        # cancel in-progess clone1
+        self._fs_cmd("clone", "cancel", self.volname, clone1)
+
+        # check clone1 status
+        clone1_result = self._get_clone_status(clone1)
+        self.assertEqual(clone1_result["status"]["state"], "canceled")
+        self.assertEqual(clone1_result["status"]["failure"]["errno"], "4")
+        self.assertEqual(clone1_result["status"]["failure"]["error_msg"], "user interrupted clone operation")
+
+        # clone removal should succeed with force after cancelled, remove clone1
+        self._fs_cmd("subvolume", "rm", self.volname, clone1, "--force")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_quota_exceeded(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume with 20MB quota
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*20
+        self._fs_cmd("subvolume", "create", self.volname, subvolume,"--mode=777", "--size", str(osize))
+
+        # do IO, write 50 files of 1MB each to exceed quota. This mostly succeeds as quota enforcement takes time.
+        try:
+            self._do_subvolume_io(subvolume, number_of_files=50)
+        except CommandFailedError:
+            # ignore quota enforcement error.
+            pass
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_in_complete_clone_rm(self):
+        """
+        Validates the removal of clone when it is not in 'complete|cancelled|failed' state.
+        The forceful removl of subvolume clone succeeds only if it's in any of the
+        'complete|cancelled|failed' states. It fails with EAGAIN in any other states.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # Use --force since clone is not complete. Returns EAGAIN as clone is not either complete or cancelled.
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, clone, "--force")
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EAGAIN:
+                raise RuntimeError("invalid error code when trying to remove failed clone")
+        else:
+            raise RuntimeError("expected error when removing a failed clone")
+
+        # cancel on-going clone
+        self._fs_cmd("clone", "cancel", self.volname, clone)
+
+        # verify canceled state
+        self._check_clone_canceled(clone)
+
+        # clone removal should succeed after cancel
+        self._fs_cmd("subvolume", "rm", self.volname, clone, "--force")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_retain_suid_guid(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # Create a file with suid, guid bits set along with executable bit.
+        args = ["subvolume", "getpath", self.volname, subvolume]
+        args = tuple(args)
+        subvolpath = self._fs_cmd(*args)
+        self.assertNotEqual(subvolpath, None)
+        subvolpath = subvolpath[1:].rstrip() # remove "/" prefix and any trailing newline
+
+        file_path = subvolpath
+        file_path = os.path.join(subvolpath, "test_suid_file")
+        self.mount_a.run_shell(["touch", file_path])
+        self.mount_a.run_shell(["chmod", "u+sx,g+sx", file_path])
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_and_reclone(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1, clone2 = self._generate_random_clone_name(2)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone1)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone1)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # now the clone is just like a normal subvolume -- snapshot the clone and fork
+        # another clone. before that do some IO so it's can be differentiated.
+        self._do_subvolume_io(clone1, create_dir="data", number_of_files=32)
+
+        # snapshot clone -- use same snap name
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone1, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, clone1, snapshot, clone2)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone2)
+
+        # verify clone
+        self._verify_clone(clone1, snapshot, clone2)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone1, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone1)
+        self._fs_cmd("subvolume", "rm", self.volname, clone2)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_cancel_in_progress(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=128)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # cancel on-going clone
+        self._fs_cmd("clone", "cancel", self.volname, clone)
+
+        # verify canceled state
+        self._check_clone_canceled(clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone, "--force")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_cancel_pending(self):
+        """
+        this test is a bit more involved compared to canceling an in-progress clone.
+        we'd need to ensure that a to-be canceled clone has still not been picked up
+        by cloner threads. exploit the fact that clones are picked up in an FCFS
+        fashion and there are four (4) cloner threads by default. When the number of
+        cloner threads increase, this test _may_ start tripping -- so, the number of
+        clone operations would need to be jacked up.
+        """
+        # default number of clone threads
+        NR_THREADS = 4
+        # good enough for 4 threads
+        NR_CLONES = 5
+        # yeh, 1gig -- we need the clone to run for sometime
+        FILE_SIZE_MB = 1024
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clones = self._generate_random_clone_name(NR_CLONES)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=4, file_size=FILE_SIZE_MB)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule clones
+        for clone in clones:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        to_wait = clones[0:NR_THREADS]
+        to_cancel = clones[NR_THREADS:]
+
+        # cancel pending clones and verify
+        for clone in to_cancel:
+            status = json.loads(self._fs_cmd("clone", "status", self.volname, clone))
+            self.assertEqual(status["status"]["state"], "pending")
+            self._fs_cmd("clone", "cancel", self.volname, clone)
+            self._check_clone_canceled(clone)
+
+        # let's cancel on-going clones. handle the case where some of the clones
+        # _just_ complete
+        for clone in list(to_wait):
+            try:
+                self._fs_cmd("clone", "cancel", self.volname, clone)
+                to_cancel.append(clone)
+                to_wait.remove(clone)
+            except CommandFailedError as ce:
+                if ce.exitstatus != errno.EINVAL:
+                    raise RuntimeError("invalid error code when cancelling on-going clone")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        for clone in to_wait:
+            self._fs_cmd("subvolume", "rm", self.volname, clone)
+        for clone in to_cancel:
+            self._fs_cmd("subvolume", "rm", self.volname, clone, "--force")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_different_groups(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+        s_group, c_group = self._generate_random_group_name(2)
+
+        # create groups
+        self._fs_cmd("subvolumegroup", "create", self.volname, s_group)
+        self._fs_cmd("subvolumegroup", "create", self.volname, c_group)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, s_group, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, subvolume_group=s_group, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, s_group)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone,
+                     '--group_name', s_group, '--target_group_name', c_group)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone, clone_group=c_group)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, source_group=s_group, clone_group=c_group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, s_group)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, s_group)
+        self._fs_cmd("subvolume", "rm", self.volname, clone, c_group)
+
+        # remove groups
+        self._fs_cmd("subvolumegroup", "rm", self.volname, s_group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, c_group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_fail_with_remove(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1, clone2 = self._generate_random_clone_name(2)
+
+        pool_capacity = 32 * 1024 * 1024
+        # number of files required to fill up 99% of the pool
+        nr_files = int((pool_capacity * 0.99) / (TestVolumes.DEFAULT_FILE_SIZE * 1024 * 1024))
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=nr_files)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # add data pool
+        new_pool = "new_pool"
+        self.fs.add_data_pool(new_pool)
+
+        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", new_pool,
+                                            "max_bytes", "{0}".format(pool_capacity // 4))
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1, "--pool_layout", new_pool)
+
+        # check clone status -- this should dramatically overshoot the pool quota
+        self._wait_for_clone_to_complete(clone1)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone1, clone_pool=new_pool)
+
+        # wait a bit so that subsequent I/O will give pool full error
+        time.sleep(120)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone2, "--pool_layout", new_pool)
+
+        # check clone status
+        self._wait_for_clone_to_fail(clone2)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone1)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, clone2)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EAGAIN:
+                raise RuntimeError("invalid error code when trying to remove failed clone")
+        else:
+            raise RuntimeError("expected error when removing a failed clone")
+
+        #  ... and with force, failed clone can be removed
+        self._fs_cmd("subvolume", "rm", self.volname, clone2, "--force")
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_on_existing_subvolumes(self):
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolumes
+        self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--mode=777")
+        self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume1, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume1, snapshot)
+
+        # schedule a clone with target as subvolume2
+        try:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, subvolume2)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EEXIST:
+                raise RuntimeError("invalid error code when cloning to existing subvolume")
+        else:
+            raise RuntimeError("expected cloning to fail if the target is an existing subvolume")
+
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, clone)
+
+        # schedule a clone with target as clone
+        try:
+            self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, clone)
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EEXIST:
+                raise RuntimeError("invalid error code when cloning to existing clone")
+        else:
+            raise RuntimeError("expected cloning to fail if the target is an existing clone")
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume1, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume1, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_pool_layout(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # add data pool
+        new_pool = "new_pool"
+        newid = self.fs.add_data_pool(new_pool)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, "--pool_layout", new_pool)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, clone_pool=new_pool)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        subvol_path = self._get_subvolume_path(self.volname, clone)
+        desired_pool = self.mount_a.getfattr(subvol_path, "ceph.dir.layout.pool")
+        try:
+            self.assertEqual(desired_pool, new_pool)
+        except AssertionError:
+            self.assertEqual(int(desired_pool), newid) # old kernel returns id
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_under_group(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+        group = self._generate_random_group_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, '--target_group_name', group)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone, clone_group=group)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, clone_group=group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone, group)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_with_attrs(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        mode = "777"
+        uid  = "1000"
+        gid  = "1000"
+        new_uid  = "1001"
+        new_gid  = "1001"
+        new_mode = "700"
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", mode, "--uid", uid, "--gid", gid)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # change subvolume attrs (to ensure clone picks up snapshot attrs)
+        self._do_subvolume_attr_update(subvolume, new_uid, new_gid, new_mode)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_with_upgrade(self):
+        """
+        yet another poor man's upgrade test -- rather than going through a full
+        upgrade cycle, emulate old types subvolumes by going through the wormhole
+        and verify clone operation.
+        further ensure that a legacy volume is not updated to v2, but clone is.
+        """
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # emulate a old-fashioned subvolume
+        createpath = os.path.join(".", "volumes", "_nogroup", subvolume)
+        self.mount_a.run_shell_payload(f"mkdir -p -m 777 {createpath}", sudo=True)
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # ensure metadata file is in legacy location, with required version v1
+        self._assert_meta_location_and_version(self.volname, subvolume, version=1, legacy=True)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # snapshot should not be deletable now
+        try:
+            self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, msg="invalid error code when removing source snapshot of a clone")
+        else:
+            self.fail("expected removing source snapshot of a clone to fail")
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, source_version=1)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # ensure metadata file is in v2 location, with required version v2
+        self._assert_meta_location_and_version(self.volname, clone)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_reconf_max_concurrent_clones(self):
+        """
+        Validate 'max_concurrent_clones' config option
+        """
+
+        # get the default number of cloner threads
+        default_max_concurrent_clones = int(self.config_get('mgr', 'mgr/volumes/max_concurrent_clones'))
+        self.assertEqual(default_max_concurrent_clones, 4)
+
+        # Increase number of cloner threads
+        self.config_set('mgr', 'mgr/volumes/max_concurrent_clones', 6)
+        max_concurrent_clones = int(self.config_get('mgr', 'mgr/volumes/max_concurrent_clones'))
+        self.assertEqual(max_concurrent_clones, 6)
+
+        # Decrease number of cloner threads
+        self.config_set('mgr', 'mgr/volumes/max_concurrent_clones', 2)
+        max_concurrent_clones = int(self.config_get('mgr', 'mgr/volumes/max_concurrent_clones'))
+        self.assertEqual(max_concurrent_clones, 2)
+
+    def test_subvolume_snapshot_config_snapshot_clone_delay(self):
+        """
+        Validate 'snapshot_clone_delay' config option
+        """
+
+        # get the default delay before starting the clone
+        default_timeout = int(self.config_get('mgr', 'mgr/volumes/snapshot_clone_delay'))
+        self.assertEqual(default_timeout, 0)
+
+        # Insert delay of 2 seconds at the beginning of the snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2)
+        default_timeout = int(self.config_get('mgr', 'mgr/volumes/snapshot_clone_delay'))
+        self.assertEqual(default_timeout, 2)
+
+        # Decrease number of cloner threads
+        self.config_set('mgr', 'mgr/volumes/max_concurrent_clones', 2)
+        max_concurrent_clones = int(self.config_get('mgr', 'mgr/volumes/max_concurrent_clones'))
+        self.assertEqual(max_concurrent_clones, 2)
+
+    def test_subvolume_under_group_snapshot_clone(self):
+        subvolume = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, group, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, subvolume_group=group, number_of_files=32)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, '--group_name', group)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone, source_group=group)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume, group)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+
+class TestMisc(TestVolumesHelper):
+    """Miscellaneous tests related to FS volume, subvolume group, and subvolume operations."""
+    def test_connection_expiration(self):
+        # unmount any cephfs mounts
+        for i in range(0, self.CLIENTS_REQUIRED):
+            self.mounts[i].umount_wait()
+        sessions = self._session_list()
+        self.assertLessEqual(len(sessions), 1) # maybe mgr is already mounted
+
+        # Get the mgr to definitely mount cephfs
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        sessions = self._session_list()
+        self.assertEqual(len(sessions), 1)
+
+        # Now wait for the mgr to expire the connection:
+        self.wait_until_evicted(sessions[0]['id'], timeout=90)
+
+    def test_mgr_eviction(self):
+        # unmount any cephfs mounts
+        for i in range(0, self.CLIENTS_REQUIRED):
+            self.mounts[i].umount_wait()
+        sessions = self._session_list()
+        self.assertLessEqual(len(sessions), 1) # maybe mgr is already mounted
+
+        # Get the mgr to definitely mount cephfs
+        subvolume = self._generate_random_subvolume_name()
+        self._fs_cmd("subvolume", "create", self.volname, subvolume)
+        sessions = self._session_list()
+        self.assertEqual(len(sessions), 1)
+
+        # Now fail the mgr, check the session was evicted
+        mgr = self.mgr_cluster.get_active_id()
+        self.mgr_cluster.mgr_fail(mgr)
+        self.wait_until_evicted(sessions[0]['id'])
+
+    def test_names_can_only_be_goodchars(self):
+        """
+        Test the creating vols, subvols subvolgroups fails when their names uses
+        characters beyond [a-zA-Z0-9 -_.].
+        """
+        volname, badname = 'testvol', 'abcd@#'
+
+        with self.assertRaises(CommandFailedError):
+            self._fs_cmd('volume', 'create', badname)
+        self._fs_cmd('volume', 'create', volname)
+
+        with self.assertRaises(CommandFailedError):
+            self._fs_cmd('subvolumegroup', 'create', volname, badname)
+
+        with self.assertRaises(CommandFailedError):
+            self._fs_cmd('subvolume', 'create', volname, badname)
+        self._fs_cmd('volume', 'rm', volname, '--yes-i-really-mean-it')
+
+    def test_subvolume_ops_on_nonexistent_vol(self):
+        # tests the fs subvolume operations on non existing volume
+
+        volname = "non_existent_subvolume"
+
+        # try subvolume operations
+        for op in ("create", "rm", "getpath", "info", "resize", "pin", "ls"):
+            try:
+                if op == "resize":
+                    self._fs_cmd("subvolume", "resize", volname, "subvolname_1", "inf")
+                elif op == "pin":
+                    self._fs_cmd("subvolume", "pin", volname, "subvolname_1", "export", "1")
+                elif op == "ls":
+                    self._fs_cmd("subvolume", "ls", volname)
+                else:
+                    self._fs_cmd("subvolume", op, volname, "subvolume_1")
+            except CommandFailedError as ce:
+                self.assertEqual(ce.exitstatus, errno.ENOENT)
+            else:
+                self.fail("expected the 'fs subvolume {0}' command to fail".format(op))
+
+        # try subvolume snapshot operations and clone create
+        for op in ("create", "rm", "info", "protect", "unprotect", "ls", "clone"):
+            try:
+                if op == "ls":
+                    self._fs_cmd("subvolume", "snapshot", op, volname, "subvolume_1")
+                elif op == "clone":
+                    self._fs_cmd("subvolume", "snapshot", op, volname, "subvolume_1", "snapshot_1", "clone_1")
+                else:
+                    self._fs_cmd("subvolume", "snapshot", op, volname, "subvolume_1", "snapshot_1")
+            except CommandFailedError as ce:
+                self.assertEqual(ce.exitstatus, errno.ENOENT)
+            else:
+                self.fail("expected the 'fs subvolume snapshot {0}' command to fail".format(op))
+
+        # try, clone status
+        try:
+            self._fs_cmd("clone", "status", volname, "clone_1")
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.ENOENT)
+        else:
+            self.fail("expected the 'fs clone status' command to fail")
+
+        # try subvolumegroup operations
+        for op in ("create", "rm", "getpath", "pin", "ls"):
+            try:
+                if op == "pin":
+                    self._fs_cmd("subvolumegroup", "pin", volname, "group_1", "export", "0")
+                elif op == "ls":
+                    self._fs_cmd("subvolumegroup", op, volname)
+                else:
+                    self._fs_cmd("subvolumegroup", op, volname, "group_1")
+            except CommandFailedError as ce:
+                self.assertEqual(ce.exitstatus, errno.ENOENT)
+            else:
+                self.fail("expected the 'fs subvolumegroup {0}' command to fail".format(op))
+
+        # try subvolumegroup snapshot operations
+        for op in ("create", "rm", "ls"):
+            try:
+                if op == "ls":
+                    self._fs_cmd("subvolumegroup", "snapshot", op, volname, "group_1")
+                else:
+                    self._fs_cmd("subvolumegroup", "snapshot", op, volname, "group_1", "snapshot_1")
+            except CommandFailedError as ce:
+                self.assertEqual(ce.exitstatus, errno.ENOENT)
+            else:
+                self.fail("expected the 'fs subvolumegroup snapshot {0}' command to fail".format(op))
+
+    def test_subvolume_upgrade_legacy_to_v1(self):
+        """
+        poor man's upgrade test -- rather than going through a full upgrade cycle,
+        emulate subvolumes by going through the wormhole and verify if they are
+        accessible.
+        further ensure that a legacy volume is not updated to v2.
+        """
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # emulate a old-fashioned subvolume -- one in the default group and
+        # the other in a custom group
+        createpath1 = os.path.join(".", "volumes", "_nogroup", subvolume1)
+        self.mount_a.run_shell(['mkdir', '-p', createpath1], sudo=True)
+
+        # create group
+        createpath2 = os.path.join(".", "volumes", group, subvolume2)
+        self.mount_a.run_shell(['mkdir', '-p', createpath2], sudo=True)
+
+        # this would auto-upgrade on access without anyone noticing
+        subvolpath1 = self._fs_cmd("subvolume", "getpath", self.volname, subvolume1)
+        self.assertNotEqual(subvolpath1, None)
+        subvolpath1 = subvolpath1.rstrip() # remove "/" prefix and any trailing newline
+
+        subvolpath2 = self._fs_cmd("subvolume", "getpath", self.volname, subvolume2, group)
+        self.assertNotEqual(subvolpath2, None)
+        subvolpath2 = subvolpath2.rstrip() # remove "/" prefix and any trailing newline
+
+        # and... the subvolume path returned should be what we created behind the scene
+        self.assertEqual(createpath1[1:], subvolpath1)
+        self.assertEqual(createpath2[1:], subvolpath2)
+
+        # ensure metadata file is in legacy location, with required version v1
+        self._assert_meta_location_and_version(self.volname, subvolume1, version=1, legacy=True)
+        self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=1, legacy=True)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_subvolume_no_upgrade_v1_sanity(self):
+        """
+        poor man's upgrade test -- theme continues...
+
+        This test is to ensure v1 subvolumes are retained as is, due to a snapshot being present, and runs through
+        a series of operations on the v1 subvolume to ensure they work as expected.
+        """
+        subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime",
+                     "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace",
+                     "type", "uid", "features", "state"]
+        snap_md = ["created_at", "data_pool", "has_pending_clones"]
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone1, clone2 = self._generate_random_clone_name(2)
+        mode = "777"
+        uid  = "1000"
+        gid  = "1000"
+
+        # emulate a v1 subvolume -- in the default group
+        subvolume_path = self._create_v1_subvolume(subvolume)
+
+        # getpath
+        subvolpath = self._get_subvolume_path(self.volname, subvolume)
+        self.assertEqual(subvolpath, subvolume_path)
+
+        # ls
+        subvolumes = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumes), 1, "subvolume ls count mismatch, expected '1', found {0}".format(len(subvolumes)))
+        self.assertEqual(subvolumes[0]['name'], subvolume,
+                         "subvolume name mismatch in ls output, expected '{0}', found '{1}'".format(subvolume, subvolumes[0]['name']))
+
+        # info
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        for md in subvol_md:
+            self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md))
+
+        self.assertEqual(subvol_info["state"], "complete",
+                         msg="expected state to be 'complete', found '{0}".format(subvol_info["state"]))
+        self.assertEqual(len(subvol_info["features"]), 2,
+                         msg="expected 1 feature, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"]))
+        for feature in ['snapshot-clone', 'snapshot-autoprotect']:
+            self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
+
+        # resize
+        nsize = self.DEFAULT_FILE_SIZE*1024*1024*10
+        self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize))
+        subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+        for md in subvol_md:
+            self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md))
+        self.assertEqual(subvol_info["bytes_quota"], nsize, "bytes_quota should be set to '{0}'".format(nsize))
+
+        # create (idempotent) (change some attrs, to ensure attrs are preserved from the snapshot on clone)
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", mode, "--uid", uid, "--gid", gid)
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=8)
+
+        # snap-create
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone1)
+
+        # ensure clone is v2
+        self._assert_meta_location_and_version(self.volname, clone1, version=2)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone1, source_version=1)
+
+        # clone (older snapshot)
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, 'fake', clone2)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone2)
+
+        # ensure clone is v2
+        self._assert_meta_location_and_version(self.volname, clone2, version=2)
+
+        # verify clone
+        # TODO: rentries will mismatch till this is fixed https://tracker.ceph.com/issues/46747
+        #self._verify_clone(subvolume, 'fake', clone2, source_version=1)
+
+        # snap-info
+        snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot))
+        for md in snap_md:
+            self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md))
+        self.assertEqual(snap_info["has_pending_clones"], "no")
+
+        # snap-ls
+        subvol_snapshots = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume))
+        self.assertEqual(len(subvol_snapshots), 2, "subvolume ls count mismatch, expected 2', found {0}".format(len(subvol_snapshots)))
+        snapshotnames = [snapshot['name'] for snapshot in subvol_snapshots]
+        for name in [snapshot, 'fake']:
+            self.assertIn(name, snapshotnames, msg="expected snapshot '{0}' in subvolume snapshot ls".format(name))
+
+        # snap-rm
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, "fake")
+
+        # ensure volume is still at version 1
+        self._assert_meta_location_and_version(self.volname, subvolume, version=1)
+
+        # rm
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone1)
+        self._fs_cmd("subvolume", "rm", self.volname, clone2)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_no_upgrade_v1_to_v2(self):
+        """
+        poor man's upgrade test -- theme continues...
+        ensure v1 to v2 upgrades are not done automatically due to various states of v1
+        """
+        subvolume1, subvolume2, subvolume3 = self._generate_random_subvolume_name(3)
+        group = self._generate_random_group_name()
+
+        # emulate a v1 subvolume -- in the default group
+        subvol1_path = self._create_v1_subvolume(subvolume1)
+
+        # emulate a v1 subvolume -- in a custom group
+        subvol2_path = self._create_v1_subvolume(subvolume2, subvol_group=group)
+
+        # emulate a v1 subvolume -- in a clone pending state
+        self._create_v1_subvolume(subvolume3, subvol_type='clone', has_snapshot=False, state='pending')
+
+        # this would attempt auto-upgrade on access, but fail to do so as snapshots exist
+        subvolpath1 = self._get_subvolume_path(self.volname, subvolume1)
+        self.assertEqual(subvolpath1, subvol1_path)
+
+        subvolpath2 = self._get_subvolume_path(self.volname, subvolume2, group_name=group)
+        self.assertEqual(subvolpath2, subvol2_path)
+
+        # this would attempt auto-upgrade on access, but fail to do so as volume is not complete
+        # use clone status, as only certain operations are allowed in pending state
+        status = json.loads(self._fs_cmd("clone", "status", self.volname, subvolume3))
+        self.assertEqual(status["status"]["state"], "pending")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume1, "fake")
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume2, "fake", group)
+
+        # ensure metadata file is in v1 location, with version retained as v1
+        self._assert_meta_location_and_version(self.volname, subvolume1, version=1)
+        self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=1)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group)
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, subvolume3)
+        except CommandFailedError as ce:
+            self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on rm of subvolume undergoing clone")
+        else:
+            self.fail("expected rm of subvolume undergoing clone to fail")
+
+        # ensure metadata file is in v1 location, with version retained as v1
+        self._assert_meta_location_and_version(self.volname, subvolume3, version=1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume3, "--force")
+
+        # verify list subvolumes returns an empty list
+        subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname))
+        self.assertEqual(len(subvolumels), 0)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_upgrade_v1_to_v2(self):
+        """
+        poor man's upgrade test -- theme continues...
+        ensure v1 to v2 upgrades work
+        """
+        subvolume1, subvolume2 = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # emulate a v1 subvolume -- in the default group
+        subvol1_path = self._create_v1_subvolume(subvolume1, has_snapshot=False)
+
+        # emulate a v1 subvolume -- in a custom group
+        subvol2_path = self._create_v1_subvolume(subvolume2, subvol_group=group, has_snapshot=False)
+
+        # this would attempt auto-upgrade on access
+        subvolpath1 = self._get_subvolume_path(self.volname, subvolume1)
+        self.assertEqual(subvolpath1, subvol1_path)
+
+        subvolpath2 = self._get_subvolume_path(self.volname, subvolume2, group_name=group)
+        self.assertEqual(subvolpath2, subvol2_path)
+
+        # ensure metadata file is in v2 location, with version retained as v2
+        self._assert_meta_location_and_version(self.volname, subvolume1, version=2)
+        self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=2)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_malicious_metafile_on_legacy_to_v1_upgrade(self):
+        """
+        Validate handcrafted .meta file on legacy subvol root doesn't break the system
+        on legacy subvol upgrade to v1
+        poor man's upgrade test -- theme continues...
+        """
+        subvol1, subvol2 = self._generate_random_subvolume_name(2)
+
+        # emulate a old-fashioned subvolume in the default group
+        createpath1 = os.path.join(".", "volumes", "_nogroup", subvol1)
+        self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath1], omit_sudo=False)
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath1, 'ceph.dir.layout.pool', default_pool, sudo=True)
+
+        # create v2 subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvol2)
+
+        # Create malicious .meta file in legacy subvolume root. Copy v2 subvolume
+        # .meta into legacy subvol1's root
+        subvol2_metapath = os.path.join(".", "volumes", "_nogroup", subvol2, ".meta")
+        self.mount_a.run_shell(['sudo', 'cp', subvol2_metapath, createpath1], omit_sudo=False)
+
+        # Upgrade legacy subvol1 to v1
+        subvolpath1 = self._fs_cmd("subvolume", "getpath", self.volname, subvol1)
+        self.assertNotEqual(subvolpath1, None)
+        subvolpath1 = subvolpath1.rstrip()
+
+        # the subvolume path returned should not be of subvol2 from handcrafted
+        # .meta file
+        self.assertEqual(createpath1[1:], subvolpath1)
+
+        # ensure metadata file is in legacy location, with required version v1
+        self._assert_meta_location_and_version(self.volname, subvol1, version=1, legacy=True)
+
+        # Authorize alice authID read-write access to subvol1. Verify it authorizes subvol1 path and not subvol2
+        # path whose '.meta' file is copied to subvol1 root
+        authid1 = "alice"
+        self._fs_cmd("subvolume", "authorize", self.volname, subvol1, authid1)
+
+        # Validate that the mds path added is of subvol1 and not of subvol2
+        out = json.loads(self.fs.mon_manager.raw_cluster_cmd("auth", "get", "client.alice", "--format=json-pretty"))
+        self.assertEqual("client.alice", out[0]["entity"])
+        self.assertEqual("allow rw path={0}".format(createpath1[1:]), out[0]["caps"]["mds"])
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvol1)
+        self._fs_cmd("subvolume", "rm", self.volname, subvol2)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_binary_metafile_on_legacy_to_v1_upgrade(self):
+        """
+        Validate binary .meta file on legacy subvol root doesn't break the system
+        on legacy subvol upgrade to v1
+        poor man's upgrade test -- theme continues...
+        """
+        subvol = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # emulate a old-fashioned subvolume -- in a custom group
+        createpath = os.path.join(".", "volumes", group, subvol)
+        self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath], omit_sudo=False)
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True)
+
+        # Create unparseable binary .meta file on legacy subvol's root
+        meta_contents = os.urandom(4096)
+        meta_filepath = os.path.join(self.mount_a.mountpoint, createpath, ".meta")
+        self.mount_a.client_remote.write_file(meta_filepath, meta_contents, sudo=True)
+
+        # Upgrade legacy subvol to v1
+        subvolpath = self._fs_cmd("subvolume", "getpath", self.volname, subvol, group)
+        self.assertNotEqual(subvolpath, None)
+        subvolpath = subvolpath.rstrip()
+
+        # The legacy subvolume path should be returned for subvol.
+        # Should ignore unparseable binary .meta file in subvol's root
+        self.assertEqual(createpath[1:], subvolpath)
+
+        # ensure metadata file is in legacy location, with required version v1
+        self._assert_meta_location_and_version(self.volname, subvol, subvol_group=group, version=1, legacy=True)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvol, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
+
+    def test_unparseable_metafile_on_legacy_to_v1_upgrade(self):
+        """
+        Validate unparseable text .meta file on legacy subvol root doesn't break the system
+        on legacy subvol upgrade to v1
+        poor man's upgrade test -- theme continues...
+        """
+        subvol = self._generate_random_subvolume_name()
+        group = self._generate_random_group_name()
+
+        # emulate a old-fashioned subvolume -- in a custom group
+        createpath = os.path.join(".", "volumes", group, subvol)
+        self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath], omit_sudo=False)
+
+        # add required xattrs to subvolume
+        default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool")
+        self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True)
+
+        # Create unparseable text .meta file on legacy subvol's root
+        meta_contents = "unparseable config\nfile ...\nunparseable config\nfile ...\n"
+        meta_filepath = os.path.join(self.mount_a.mountpoint, createpath, ".meta")
+        self.mount_a.client_remote.write_file(meta_filepath, meta_contents, sudo=True)
+
+        # Upgrade legacy subvol to v1
+        subvolpath = self._fs_cmd("subvolume", "getpath", self.volname, subvol, group)
+        self.assertNotEqual(subvolpath, None)
+        subvolpath = subvolpath.rstrip()
+
+        # The legacy subvolume path should be returned for subvol.
+        # Should ignore unparseable binary .meta file in subvol's root
+        self.assertEqual(createpath[1:], subvolpath)
+
+        # ensure metadata file is in legacy location, with required version v1
+        self._assert_meta_location_and_version(self.volname, subvol, subvol_group=group, version=1, legacy=True)
+
+        # remove subvolume
+        self._fs_cmd("subvolume", "rm", self.volname, subvol, group)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+        # remove group
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
diff --git a/qa/tasks/cephfs/xfstests_dev.py b/qa/tasks/cephfs/xfstests_dev.py
new file mode 100644
index 000000000..ea9e14b89
--- /dev/null
+++ b/qa/tasks/cephfs/xfstests_dev.py
@@ -0,0 +1,165 @@
+from io import BytesIO
+import logging
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+logger = logging.getLogger(__name__)
+
+
+# TODO: add code to run non-ACL tests too.
+# TODO: get tests running with SCRATCH_DEV and SCRATCH_DIR.
+# TODO: make xfstests-dev tests running without running `make install`.
+# TODO: make xfstests-dev compatible with ceph-fuse. xfstests-dev remounts
+# CephFS before running tests using kernel, so ceph-fuse mounts are never
+# actually testsed.
+class XFSTestsDev(CephFSTestCase):
+
+    def setUp(self):
+        super(XFSTestsDev, self).setUp()
+        self.prepare_xfstests_dev()
+
+    def prepare_xfstests_dev(self):
+        self.get_repo()
+        self.get_test_and_scratch_dirs_ready()
+        self.install_deps()
+        self.create_reqd_users()
+        self.write_local_config()
+
+        # NOTE: On teuthology machines it's necessary to run "make" as
+        # superuser since the repo is cloned somewhere in /tmp.
+        self.mount_a.client_remote.run(args=['sudo', 'make'],
+                                       cwd=self.repo_path, stdout=BytesIO(),
+                                       stderr=BytesIO())
+        self.mount_a.client_remote.run(args=['sudo', 'make', 'install'],
+                                       cwd=self.repo_path, omit_sudo=False,
+                                       stdout=BytesIO(), stderr=BytesIO())
+
+    def get_repo(self):
+        """
+        Clone xfstests_dev repository. If already present, update it.
+        """
+        # TODO: make sure that repo is not cloned for every test. it should
+        # happen only once.
+        remoteurl = 'https://git.ceph.com/xfstests-dev.git'
+        self.repo_path = self.mount_a.client_remote.mkdtemp(suffix=
+                                                            'xfstests-dev')
+        self.mount_a.run_shell(['git', 'clone', remoteurl, '--depth', '1',
+                                self.repo_path])
+
+    def get_admin_key(self):
+        import configparser
+
+        cp = configparser.ConfigParser()
+        cp.read_string(self.fs.mon_manager.raw_cluster_cmd(
+            'auth', 'get-or-create', 'client.admin'))
+
+        return cp['client.admin']['key']
+
+    def get_test_and_scratch_dirs_ready(self):
+        """ "test" and "scratch" directories are directories inside Ceph FS.
+            And, test and scratch mounts are path on the local FS where "test"
+            and "scratch" directories would be mounted. Look at xfstests-dev
+            local.config's template inside this file to get some context.
+        """
+        from os.path import join
+
+        self.test_dirname = 'test'
+        self.mount_a.run_shell(['mkdir', self.test_dirname])
+        # read var name as "test dir's mount path"
+        self.test_dirs_mount_path = self.mount_a.client_remote.mkdtemp(
+            suffix=self.test_dirname)
+        self.mount_a.run_shell(['sudo','ln','-s',join(self.mount_a.mountpoint,
+                                                      self.test_dirname),
+                                self.test_dirs_mount_path])
+
+        self.scratch_dirname = 'scratch'
+        self.mount_a.run_shell(['mkdir', self.scratch_dirname])
+        # read var name as "scratch dir's mount path"
+        self.scratch_dirs_mount_path = self.mount_a.client_remote.mkdtemp(
+            suffix=self.scratch_dirname)
+        self.mount_a.run_shell(['sudo','ln','-s',join(self.mount_a.mountpoint,
+                                                      self.scratch_dirname),
+                                self.scratch_dirs_mount_path])
+
+    def install_deps(self):
+        from teuthology.misc import get_system_type
+
+        distro, version = get_system_type(self.mount_a.client_remote,
+                                          distro=True, version=True)
+        distro = distro.lower()
+        major_ver_num = int(version.split('.')[0]) # only keep major release
+                                                   # number
+
+        # we keep fedora here so that right deps are installed when this test
+        # is run locally by a dev.
+        if distro in ('redhatenterpriseserver', 'redhatenterprise', 'fedora',
+                      'centos'):
+            deps = """acl attr automake bc dbench dump e2fsprogs fio \
+            gawk gcc indent libtool lvm2 make psmisc quota sed \
+            xfsdump xfsprogs \
+            libacl-devel libattr-devel libaio-devel libuuid-devel \
+            xfsprogs-devel btrfs-progs-devel python2 sqlite""".split()
+            deps_old_distros = ['xfsprogs-qa-devel']
+
+            if distro != 'fedora' and major_ver_num > 7:
+                    deps.remove('btrfs-progs-devel')
+
+            args = ['sudo', 'yum', 'install', '-y'] + deps + deps_old_distros
+        elif distro == 'ubuntu':
+            deps = """xfslibs-dev uuid-dev libtool-bin \
+            e2fsprogs automake gcc libuuid1 quota attr libattr1-dev make \
+            libacl1-dev libaio-dev xfsprogs libgdbm-dev gawk fio dbench \
+            uuid-runtime python sqlite3""".split()
+
+            if major_ver_num >= 19:
+                deps[deps.index('python')] ='python2'
+            args = ['sudo', 'apt-get', 'install', '-y'] + deps
+        else:
+            raise RuntimeError('expected a yum based or a apt based system')
+
+        self.mount_a.client_remote.run(args=args, omit_sudo=False)
+
+    def create_reqd_users(self):
+        self.mount_a.client_remote.run(args=['sudo', 'useradd', 'fsgqa'],
+                                       omit_sudo=False, check_status=False)
+        self.mount_a.client_remote.run(args=['sudo', 'groupadd', 'fsgqa'],
+                                       omit_sudo=False, check_status=False)
+        self.mount_a.client_remote.run(args=['sudo', 'useradd',
+                                             '123456-fsgqa'], omit_sudo=False,
+                                       check_status=False)
+
+    def write_local_config(self):
+        from os.path import join
+        from textwrap import dedent
+
+        mon_sock = self.fs.mon_manager.get_msgrv1_mon_socks()[0]
+        self.test_dev = mon_sock + ':/' + self.test_dirname
+        self.scratch_dev = mon_sock + ':/' + self.scratch_dirname
+
+        xfstests_config_contents = dedent('''\
+            export FSTYP=ceph
+            export TEST_DEV={}
+            export TEST_DIR={}
+            #export SCRATCH_DEV={}
+            #export SCRATCH_MNT={}
+            export TEST_FS_MOUNT_OPTS="-o name=admin,secret={}"
+            ''').format(self.test_dev, self.test_dirs_mount_path, self.scratch_dev,
+                        self.scratch_dirs_mount_path, self.get_admin_key())
+
+        self.mount_a.client_remote.write_file(join(self.repo_path, 'local.config'),
+                                              xfstests_config_contents, sudo=True)
+
+    def tearDown(self):
+        self.mount_a.client_remote.run(args=['sudo', 'userdel', '--force',
+                                             '--remove', 'fsgqa'],
+                                       omit_sudo=False, check_status=False)
+        self.mount_a.client_remote.run(args=['sudo', 'userdel', '--force',
+                                             '--remove', '123456-fsgqa'],
+                                       omit_sudo=False, check_status=False)
+        self.mount_a.client_remote.run(args=['sudo', 'groupdel', 'fsgqa'],
+                                       omit_sudo=False, check_status=False)
+
+        self.mount_a.client_remote.run(args=['sudo', 'rm', '-rf',
+                                             self.repo_path],
+                                       omit_sudo=False, check_status=False)
+
+        super(XFSTestsDev, self).tearDown()
diff --git a/qa/tasks/cephfs_mirror.py b/qa/tasks/cephfs_mirror.py
new file mode 100644
index 000000000..9602a5a7f
--- /dev/null
+++ b/qa/tasks/cephfs_mirror.py
@@ -0,0 +1,73 @@
+"""
+Task for running cephfs mirror daemons
+"""
+
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology.task import Task
+from tasks.ceph_manager import get_valgrind_args
+from tasks.util import get_remote_for_role
+
+log = logging.getLogger(__name__)
+
+class CephFSMirror(Task):
+    def __init__(self, ctx, config):
+        super(CephFSMirror, self).__init__(ctx, config)
+        self.log = log
+
+    def setup(self):
+        super(CephFSMirror, self).setup()
+        try:
+            self.client = self.config['client']
+        except KeyError:
+            raise ConfigError('cephfs-mirror requires a client to connect')
+
+        self.cluster_name, type_, self.client_id = misc.split_role(self.client)
+        if not type_ == 'client':
+            raise ConfigError(f'client role {self.client} must be a client')
+        self.remote = get_remote_for_role(self.ctx, self.client)
+
+    def begin(self):
+        super(CephFSMirror, self).begin()
+        testdir = misc.get_testdir(self.ctx)
+
+        args = [
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            'daemon-helper',
+            'term',
+            ]
+
+        if 'valgrind' in self.config:
+            args = get_valgrind_args(
+                testdir, 'cephfs-mirror-{id}'.format(id=self.client),
+                args, self.config.get('valgrind'))
+
+        args.extend([
+            'cephfs-mirror',
+            '--cluster',
+            self.cluster_name,
+            '--id',
+            self.client_id,
+            ])
+        if 'run_in_foreground' in self.config:
+            args.extend(['--foreground'])
+
+        self.ctx.daemons.add_daemon(
+            self.remote, 'cephfs-mirror', self.client,
+            args=args,
+            logger=self.log.getChild(self.client),
+            stdin=run.PIPE,
+            wait=False,
+        )
+
+    def end(self):
+        mirror_daemon = self.ctx.daemons.get_daemon('cephfs-mirror', self.client)
+        mirror_daemon.stop()
+        super(CephFSMirror, self).end()
+
+task = CephFSMirror
diff --git a/qa/tasks/cephfs_mirror_thrash.py b/qa/tasks/cephfs_mirror_thrash.py
new file mode 100644
index 000000000..91f60ac50
--- /dev/null
+++ b/qa/tasks/cephfs_mirror_thrash.py
@@ -0,0 +1,219 @@
+"""
+Task for thrashing cephfs-mirror daemons
+"""
+
+import contextlib
+import logging
+import random
+import signal
+import socket
+import time
+
+from gevent import sleep
+from gevent.greenlet import Greenlet
+from gevent.event import Event
+
+from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra import run
+from tasks.thrasher import Thrasher
+
+log = logging.getLogger(__name__)
+
+
+class CephFSMirrorThrasher(Thrasher, Greenlet):
+    """
+    CephFSMirrorThrasher::
+
+    The CephFSMirrorThrasher thrashes cephfs-mirror daemons during execution of other
+    tasks (workunits, etc).
+
+    The config is optional.  Many of the config parameters are a maximum value
+    to use when selecting a random value from a range.  The config is a dict
+    containing some or all of:
+
+    cluster: [default: ceph] cluster to thrash
+
+    max_thrash: [default: 1] the maximum number of active cephfs-mirror daemons per
+      cluster will be thrashed at any given time.
+
+    min_thrash_delay: [default: 60] minimum number of seconds to delay before
+      thrashing again.
+
+    max_thrash_delay: [default: 120] maximum number of seconds to delay before
+      thrashing again.
+
+    max_revive_delay: [default: 10] maximum number of seconds to delay before
+      bringing back a thrashed cephfs-mirror daemon.
+
+    randomize: [default: true] enables randomization and use the max/min values
+
+    seed: [no default] seed the random number generator
+
+    Examples::
+
+      The following example disables randomization, and uses the max delay
+      values:
+
+      tasks:
+      - ceph:
+      - cephfs_mirror_thrash:
+          randomize: False
+          max_thrash_delay: 10
+    """
+
+    def __init__(self, ctx, config, cluster, daemons):
+        super(CephFSMirrorThrasher, self).__init__()
+
+        self.ctx = ctx
+        self.config = config
+        self.cluster = cluster
+        self.daemons = daemons
+
+        self.logger = log
+        self.name = 'thrasher.cephfs_mirror.[{cluster}]'.format(cluster = cluster)
+        self.stopping = Event()
+
+        self.randomize = bool(self.config.get('randomize', True))
+        self.max_thrash = int(self.config.get('max_thrash', 1))
+        self.min_thrash_delay = float(self.config.get('min_thrash_delay', 5.0))
+        self.max_thrash_delay = float(self.config.get('max_thrash_delay', 10))
+        self.max_revive_delay = float(self.config.get('max_revive_delay', 15.0))
+
+    def _run(self):
+        try:
+            self.do_thrash()
+        except Exception as e:
+            # See _run exception comment for MDSThrasher
+            self.set_thrasher_exception(e)
+            self.logger.exception("exception:")
+            # Allow successful completion so gevent doesn't see an exception.
+            # The DaemonWatchdog will observe the error and tear down the test.
+
+    def log(self, x):
+        """Write data to logger assigned to this CephFSMirrorThrasher"""
+        self.logger.info(x)
+
+    def stop(self):
+        self.stopping.set()
+
+    def do_thrash(self):
+        """
+        Perform the random thrashing action
+        """
+
+        self.log('starting thrash for cluster {cluster}'.format(cluster=self.cluster))
+        stats = {
+            "kill": 0,
+        }
+
+        while not self.stopping.is_set():
+            delay = self.max_thrash_delay
+            if self.randomize:
+                delay = random.randrange(self.min_thrash_delay, self.max_thrash_delay)
+
+            if delay > 0.0:
+                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
+                self.stopping.wait(delay)
+                if self.stopping.is_set():
+                    continue
+
+            killed_daemons = []
+
+            weight = 1.0 / len(self.daemons)
+            count = 0
+            for daemon in self.daemons:
+                skip = random.uniform(0.0, 1.0)
+                if weight <= skip:
+                    self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
+                        label=daemon.id_, skip=skip, weight=weight))
+                    continue
+
+                self.log('kill {label}'.format(label=daemon.id_))
+                try:
+                    daemon.signal(signal.SIGTERM)
+                except Exception as e:
+                    self.log(f'exception when stopping mirror daemon: {e}')
+                else:
+                    killed_daemons.append(daemon)
+                    stats['kill'] += 1
+
+                # if we've reached max_thrash, we're done
+                count += 1
+                if count >= self.max_thrash:
+                    break
+
+            if killed_daemons:
+                # wait for a while before restarting
+                delay = self.max_revive_delay
+                if self.randomize:
+                    delay = random.randrange(0.0, self.max_revive_delay)
+
+                self.log('waiting for {delay} secs before reviving daemons'.format(delay=delay))
+                sleep(delay)
+
+                for daemon in killed_daemons:
+                    self.log('waiting for {label}'.format(label=daemon.id_))
+                    try:
+                        run.wait([daemon.proc], timeout=600)
+                    except CommandFailedError:
+                        pass
+                    except:
+                        self.log('Failed to stop {label}'.format(label=daemon.id_))
+
+                        try:
+                            # try to capture a core dump
+                            daemon.signal(signal.SIGABRT)
+                        except socket.error:
+                            pass
+                        raise
+                    finally:
+                        daemon.reset()
+
+                for daemon in killed_daemons:
+                    self.log('reviving {label}'.format(label=daemon.id_))
+                    daemon.start()
+
+        for stat in stats:
+            self.log("stat['{key}'] = {value}".format(key = stat, value = stats[stat]))
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Stress test the cephfs-mirror by thrashing while another task/workunit
+    is running.
+
+    Please refer to CephFSMirrorThrasher class for further information on the
+    available options.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'cephfs_mirror_thrash task only accepts a dict for configuration'
+
+    cluster = config.get('cluster', 'ceph')
+    daemons = list(ctx.daemons.iter_daemons_of_role('cephfs-mirror', cluster))
+    assert len(daemons) > 0, \
+        'cephfs_mirror_thrash task requires at least 1 cephfs-mirror daemon'
+
+    # choose random seed
+    if 'seed' in config:
+        seed = int(config['seed'])
+    else:
+        seed = int(time.time())
+    log.info('cephfs_mirror_thrash using random seed: {seed}'.format(seed=seed))
+    random.seed(seed)
+
+    thrasher = CephFSMirrorThrasher(ctx, config, cluster, daemons)
+    thrasher.start()
+    ctx.ceph[cluster].thrashers.append(thrasher)
+
+    try:
+        log.debug('Yielding')
+        yield
+    finally:
+        log.info('joining cephfs_mirror_thrash')
+        thrasher.stop()
+        if thrasher.exception is not None:
+            raise RuntimeError('error during thrashing')
+        thrasher.join()
+        log.info('done joining')
diff --git a/qa/tasks/cephfs_test_runner.py b/qa/tasks/cephfs_test_runner.py
new file mode 100644
index 000000000..8a4919b93
--- /dev/null
+++ b/qa/tasks/cephfs_test_runner.py
@@ -0,0 +1,213 @@
+import contextlib
+import logging
+import os
+import unittest
+from unittest import suite, loader, case
+from teuthology.task import interactive
+from teuthology import misc
+from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
+from tasks.mgr.mgr_test_case import MgrCluster
+
+log = logging.getLogger(__name__)
+
+
+class DecoratingLoader(loader.TestLoader):
+    """
+    A specialization of TestLoader that tags some extra attributes
+    onto test classes as they are loaded.
+    """
+    def __init__(self, params):
+        self._params = params
+        super(DecoratingLoader, self).__init__()
+
+    def _apply_params(self, obj):
+        for k, v in self._params.items():
+            if obj.__class__ is type:
+                cls = obj
+            else:
+                cls = obj.__class__
+            setattr(cls, k, v)
+
+    def loadTestsFromTestCase(self, testCaseClass):
+        self._apply_params(testCaseClass)
+        return super(DecoratingLoader, self).loadTestsFromTestCase(testCaseClass)
+
+    def loadTestsFromName(self, name, module=None):
+        result = super(DecoratingLoader, self).loadTestsFromName(name, module)
+
+        # Special case for when we were called with the name of a method, we get
+        # a suite with one TestCase
+        tests_in_result = list(result)
+        if len(tests_in_result) == 1 and isinstance(tests_in_result[0], case.TestCase):
+            self._apply_params(tests_in_result[0])
+
+        return result
+
+
+class LogStream(object):
+    def __init__(self):
+        self.buffer = ""
+
+    def write(self, data):
+        self.buffer += data
+        if "\n" in self.buffer:
+            lines = self.buffer.split("\n")
+            for line in lines[:-1]:
+                log.info(line)
+            self.buffer = lines[-1]
+
+    def flush(self):
+        pass
+
+
+class InteractiveFailureResult(unittest.TextTestResult):
+    """
+    Specialization that implements interactive-on-error style
+    behavior.
+    """
+    ctx = None
+
+    def addFailure(self, test, err):
+        log.error(self._exc_info_to_string(err, test))
+        log.error("Failure in test '{0}', going interactive".format(
+            self.getDescription(test)
+        ))
+        interactive.task(ctx=self.ctx, config=None)
+
+    def addError(self, test, err):
+        log.error(self._exc_info_to_string(err, test))
+        log.error("Error in test '{0}', going interactive".format(
+            self.getDescription(test)
+        ))
+        interactive.task(ctx=self.ctx, config=None)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the CephFS test cases.
+
+    Run everything in tasks/cephfs/test_*.py:
+
+    ::
+
+        tasks:
+          - install:
+          - ceph:
+          - ceph-fuse:
+          - cephfs_test_runner:
+
+    `modules` argument allows running only some specific modules:
+
+    ::
+
+        tasks:
+            ...
+          - cephfs_test_runner:
+              modules:
+                - tasks.cephfs.test_sessionmap
+                - tasks.cephfs.test_auto_repair
+
+    By default, any cases that can't be run on the current cluster configuration
+    will generate a failure.  When the optional `fail_on_skip` argument is set
+    to false, any tests that can't be run on the current configuration will
+    simply be skipped:
+
+    ::
+        tasks:
+            ...
+         - cephfs_test_runner:
+           fail_on_skip: false
+
+    """
+
+    ceph_cluster = CephCluster(ctx)
+
+    if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
+        mds_cluster = MDSCluster(ctx)
+        fs = Filesystem(ctx)
+    else:
+        mds_cluster = None
+        fs = None
+
+    if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
+        mgr_cluster = MgrCluster(ctx)
+    else:
+        mgr_cluster = None
+
+    # Mount objects, sorted by ID
+    if hasattr(ctx, 'mounts'):
+        mounts = [v for k, v in sorted(ctx.mounts.items(), key=lambda mount: mount[0])]
+    else:
+        # The test configuration has a filesystem but no fuse/kclient mounts
+        mounts = []
+
+    decorating_loader = DecoratingLoader({
+        "ctx": ctx,
+        "mounts": mounts,
+        "fs": fs,
+        "ceph_cluster": ceph_cluster,
+        "mds_cluster": mds_cluster,
+        "mgr_cluster": mgr_cluster,
+    })
+
+    fail_on_skip = config.get('fail_on_skip', True)
+
+    # Put useful things onto ctx for interactive debugging
+    ctx.fs = fs
+    ctx.mds_cluster = mds_cluster
+    ctx.mgr_cluster = mgr_cluster
+
+    # Depending on config, either load specific modules, or scan for moduless
+    if config and 'modules' in config and config['modules']:
+        module_suites = []
+        for mod_name in config['modules']:
+            # Test names like cephfs.test_auto_repair
+            module_suites.append(decorating_loader.loadTestsFromName(mod_name))
+        overall_suite = suite.TestSuite(module_suites)
+    else:
+        # Default, run all tests
+        overall_suite = decorating_loader.discover(
+            os.path.join(
+                os.path.dirname(os.path.abspath(__file__)),
+                "cephfs/"
+            )
+        )
+
+    if ctx.config.get("interactive-on-error", False):
+        InteractiveFailureResult.ctx = ctx
+        result_class = InteractiveFailureResult
+    else:
+        result_class = unittest.TextTestResult
+
+    class LoggingResult(result_class):
+        def startTest(self, test):
+            log.info("Starting test: {0}".format(self.getDescription(test)))
+            return super(LoggingResult, self).startTest(test)
+
+        def addSkip(self, test, reason):
+            if fail_on_skip:
+                # Don't just call addFailure because that requires a traceback
+                self.failures.append((test, reason))
+            else:
+                super(LoggingResult, self).addSkip(test, reason)
+
+    # Execute!
+    result = unittest.TextTestRunner(
+        stream=LogStream(),
+        resultclass=LoggingResult,
+        verbosity=2,
+        failfast=True).run(overall_suite)
+
+    if not result.wasSuccessful():
+        result.printErrors()  # duplicate output at end for convenience
+
+        bad_tests = []
+        for test, error in result.errors:
+            bad_tests.append(str(test))
+        for test, failure in result.failures:
+            bad_tests.append(str(test))
+
+        raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))
+
+    yield
diff --git a/qa/tasks/cephfs_upgrade_snap.py b/qa/tasks/cephfs_upgrade_snap.py
new file mode 100644
index 000000000..1b0a737a7
--- /dev/null
+++ b/qa/tasks/cephfs_upgrade_snap.py
@@ -0,0 +1,47 @@
+"""
+Upgrade cluster snap format.
+"""
+
+import logging
+import time
+
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Upgrade CephFS file system snap format.
+    """
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'snap-upgrade task only accepts a dict for configuration'
+
+    fs = Filesystem(ctx)
+
+    mds_map = fs.get_mds_map()
+    assert(mds_map['max_mds'] == 1)
+
+    json = fs.run_scrub(["start", "/", "force", "recursive", "repair"])
+    if not json or json['return_code'] == 0:
+        assert(fs.wait_until_scrub_complete(tag=json["scrub_tag"]) == True)
+        log.info("scrub / completed")
+    else:
+        log.info("scrub / failed: {}".format(json))
+
+    json = fs.run_scrub(["start", "~mdsdir", "force", "recursive", "repair"])
+    if not json or json['return_code'] == 0:
+        assert(fs.wait_until_scrub_complete(tag=json["scrub_tag"]) == True)
+        log.info("scrub ~mdsdir completed")
+    else:
+        log.info("scrub / failed: {}".format(json))
+
+    for i in range(0, 10):
+        mds_map = fs.get_mds_map()
+        if (mds_map['flags'] & (1<<1)) != 0 and (mds_map['flags'] & (1<<4)) != 0:
+            break
+        time.sleep(10)
+    assert((mds_map['flags'] & (1<<1)) != 0) # Test CEPH_MDSMAP_ALLOW_SNAPS
+    assert((mds_map['flags'] & (1<<4)) != 0) # Test CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
diff --git a/qa/tasks/check_counter.py b/qa/tasks/check_counter.py
new file mode 100644
index 000000000..daa81973b
--- /dev/null
+++ b/qa/tasks/check_counter.py
@@ -0,0 +1,98 @@
+
+import logging
+import json
+
+from teuthology.task import Task
+from teuthology import misc
+
+log = logging.getLogger(__name__)
+
+
+class CheckCounter(Task):
+    """
+    Use this task to validate that some daemon perf counters were
+    incremented by the nested tasks.
+
+    Config:
+     'cluster_name': optional, specify which cluster
+     'target': dictionary of daemon type to list of performance counters.
+     'dry_run': just log the value of the counters, don't fail if they
+                aren't nonzero.
+
+    Success condition is that for all of the named counters, at least
+    one of the daemons of that type has the counter nonzero.
+
+    Example to check cephfs dirfrag splits are happening:
+    - install:
+    - ceph:
+    - ceph-fuse:
+    - check-counter:
+        counters:
+            mds:
+                - "mds.dir_split"
+    - workunit: ...
+    """
+
+    def start(self):
+        log.info("START")
+
+    def end(self):
+        overrides = self.ctx.config.get('overrides', {})
+        misc.deep_merge(self.config, overrides.get('check-counter', {}))
+
+        cluster_name = self.config.get('cluster_name', None)
+        dry_run = self.config.get('dry_run', False)
+        targets = self.config.get('counters', {})
+
+        if cluster_name is None:
+            cluster_name = next(iter(self.ctx.managers.keys()))
+
+        for daemon_type, counters in targets.items():
+            # List of 'a', 'b', 'c'...
+            daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type))
+            daemons = dict([(daemon_id,
+                             self.ctx.daemons.get_daemon(daemon_type, daemon_id))
+                            for daemon_id in daemon_ids])
+
+            seen = set()
+
+            for daemon_id, daemon in daemons.items():
+                if not daemon.running():
+                    log.info("Ignoring daemon {0}, it isn't running".format(daemon_id))
+                    continue
+                else:
+                    log.debug("Getting stats from {0}".format(daemon_id))
+
+                manager = self.ctx.managers[cluster_name]
+                proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"])
+                response_data = proc.stdout.getvalue().strip()
+                if response_data:
+                    perf_dump = json.loads(response_data)
+                else:
+                    log.warning("No admin socket response from {0}, skipping".format(daemon_id))
+                    continue
+
+                for counter in counters:
+                    subsys, counter_id = counter.split(".")
+                    if subsys not in perf_dump or counter_id not in perf_dump[subsys]:
+                        log.warning("Counter '{0}' not found on daemon {1}.{2}".format(
+                            counter, daemon_type, daemon_id))
+                        continue
+                    value = perf_dump[subsys][counter_id]
+
+                    log.info("Daemon {0}.{1} {2}={3}".format(
+                        daemon_type, daemon_id, counter, value
+                    ))
+
+                    if value > 0:
+                        seen.add(counter)
+
+            if not dry_run:
+                unseen = set(counters) - set(seen)
+                if unseen:
+                    raise RuntimeError("The following counters failed to be set "
+                                       "on {0} daemons: {1}".format(
+                        daemon_type, unseen
+                    ))
+
+task = CheckCounter
diff --git a/qa/tasks/cifs_mount.py b/qa/tasks/cifs_mount.py
new file mode 100644
index 000000000..b282b0b7d
--- /dev/null
+++ b/qa/tasks/cifs_mount.py
@@ -0,0 +1,137 @@
+"""
+Mount cifs clients.  Unmount when finished.
+"""
+import contextlib
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Mount/unmount a cifs client.
+
+    The config is optional and defaults to mounting on all clients. If
+    a config is given, it is expected to be a list of clients to do
+    this operation on.
+
+    Example that starts smbd and mounts cifs on all nodes::
+
+        tasks:
+        - ceph:
+        - samba:
+        - cifs-mount:
+        - interactive:
+
+    Example that splits smbd and cifs:
+
+        tasks:
+        - ceph:
+        - samba: [samba.0]
+        - cifs-mount: [client.0]
+        - ceph-fuse: [client.1]
+        - interactive:
+
+    Example that specifies the share name:
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - samba:
+            samba.0:
+                cephfuse: "{testdir}/mnt.0"
+        - cifs-mount:
+            client.0:
+                share: cephfuse
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info('Mounting cifs clients...')
+
+    if config is None:
+        config = dict(('client.{id}'.format(id=id_), None)
+                  for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client'))
+    elif isinstance(config, list):
+        config = dict((name, None) for name in config)
+
+    clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys()))
+
+    from .samba import get_sambas
+    samba_roles = ['samba.{id_}'.format(id_=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba')]
+    sambas = list(get_sambas(ctx=ctx, roles=samba_roles))
+    (ip, _) = sambas[0][1].ssh.get_transport().getpeername()
+    log.info('samba ip: {ip}'.format(ip=ip))
+
+    for id_, remote in clients:
+        mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_))
+        log.info('Mounting cifs client.{id} at {remote} {mnt}...'.format(
+                id=id_, remote=remote,mnt=mnt))
+
+        remote.run(
+            args=[
+                'mkdir',
+                '--',
+                mnt,
+                ],
+            )
+
+        rolestr = 'client.{id_}'.format(id_=id_)
+        unc = "ceph"
+        log.info("config: {c}".format(c=config))
+        if config[rolestr] is not None and 'share' in config[rolestr]:
+            unc = config[rolestr]['share']
+
+        remote.run(
+            args=[
+                'sudo',
+                'mount',
+                '-t',
+                'cifs',
+                '//{sambaip}/{unc}'.format(sambaip=ip, unc=unc),
+                '-o',
+                'username=ubuntu,password=ubuntu',
+                mnt,
+                ],
+            )
+
+        remote.run(
+            args=[
+                'sudo',
+                'chown',
+                'ubuntu:ubuntu',
+                '{m}/'.format(m=mnt),
+                ],
+            )
+
+    try:
+        yield
+    finally:
+        log.info('Unmounting cifs clients...')
+        for id_, remote in clients:
+            remote.run(
+                args=[
+                    'sudo',
+                    'umount',
+                    mnt,
+                    ],
+                )
+        for id_, remote in clients:
+            while True:
+                try:
+                    remote.run(
+                        args=[
+                            'rmdir', '--', mnt,
+                            run.Raw('2>&1'),
+                            run.Raw('|'),
+                            'grep', 'Device or resource busy',
+                            ],
+                        )
+                    import time
+                    time.sleep(1)
+                except Exception:
+                    break
diff --git a/qa/tasks/cram.py b/qa/tasks/cram.py
new file mode 100644
index 000000000..a445a146f
--- /dev/null
+++ b/qa/tasks/cram.py
@@ -0,0 +1,160 @@
+"""
+Cram tests
+"""
+import logging
+import os
+
+from tasks.util.workunit import get_refspec_after_overrides
+
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+from teuthology.config import config as teuth_config
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Run all cram tests from the specified paths on the specified
+    clients. Each client runs tests in parallel as default, and
+    you can also disable it by adding "parallel: False" option.
+
+    Limitations:
+    Tests must have a .t suffix. Tests with duplicate names will
+    overwrite each other, so only the last one will run.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - cram:
+            clients:
+              client.0:
+              - qa/test.t
+              - qa/test2.t]
+              client.1: [qa/test.t]
+            branch: foo
+            parallel: False
+
+    You can also run a list of cram tests on all clients::
+
+        tasks:
+        - ceph:
+        - cram:
+            clients:
+              all: [qa/test.t]
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert isinstance(config, dict)
+    assert 'clients' in config and isinstance(config['clients'], dict), \
+           'configuration must contain a dictionary of clients'
+
+    clients = teuthology.replace_all_with_clients(ctx.cluster,
+                                                  config['clients'])
+    testdir = teuthology.get_testdir(ctx)
+
+    overrides = ctx.config.get('overrides', {})
+    refspec = get_refspec_after_overrides(config, overrides)
+
+    _parallel = config.get('parallel', True)
+
+    git_url = teuth_config.get_ceph_qa_suite_git_url()
+    log.info('Pulling tests from %s ref %s', git_url, refspec)
+
+    try:
+        for client, tests in clients.items():
+            (remote,) = (ctx.cluster.only(client).remotes.keys())
+            client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client)
+            remote.run(
+                args=[
+                    'mkdir', '--', client_dir,
+                    run.Raw('&&'),
+                    'python3', '-m', 'venv', '{tdir}/virtualenv'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    '{tdir}/virtualenv/bin/pip'.format(tdir=testdir),
+                    'install', 'cram==0.6',
+                    ],
+                )
+            clone_dir = '{tdir}/clone.{role}'.format(tdir=testdir, role=client)
+            remote.run(args=refspec.clone(git_url, clone_dir))
+
+            for test in tests:
+                assert test.endswith('.t'), 'tests must end in .t'
+                remote.run(
+                    args=[
+                        'cp', '--', os.path.join(clone_dir, test), client_dir,
+                        ],
+                    )
+
+        if _parallel:
+            with parallel() as p:
+                for role in clients.keys():
+                    p.spawn(_run_tests, ctx, role)
+        else:
+            for role in clients.keys():
+                _run_tests(ctx, role)
+    finally:
+        for client, tests in clients.items():
+            (remote,) = (ctx.cluster.only(client).remotes.keys())
+            client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client)
+            test_files = set([test.rsplit('/', 1)[1] for test in tests])
+
+            # remove test files unless they failed
+            for test_file in test_files:
+                abs_file = os.path.join(client_dir, test_file)
+                remote.run(
+                    args=[
+                        'test', '-f', abs_file + '.err',
+                        run.Raw('||'),
+                        'rm', '-f', '--', abs_file,
+                        ],
+                    )
+
+            # ignore failure since more than one client may
+            # be run on a host, and the client dir should be
+            # non-empty if the test failed
+            clone_dir = '{tdir}/clone.{role}'.format(tdir=testdir, role=client)
+            remote.run(
+                args=[
+                    'rm', '-rf', '--',
+                    '{tdir}/virtualenv'.format(tdir=testdir),
+                    clone_dir,
+                    run.Raw(';'),
+                    'rmdir', '--ignore-fail-on-non-empty', client_dir,
+                    ],
+                )
+
+def _run_tests(ctx, role):
+    """
+    For each role, check to make sure it's a client, then run the cram on that client
+
+    :param ctx: Context
+    :param role: Roles
+    """
+    assert isinstance(role, str)
+    PREFIX = 'client.'
+    if role.startswith(PREFIX):
+        id_ = role[len(PREFIX):]
+    else:
+        id_ = role
+    (remote,) = (ctx.cluster.only(role).remotes.keys())
+    ceph_ref = ctx.summary.get('ceph-sha1', 'master')
+
+    testdir = teuthology.get_testdir(ctx)
+    log.info('Running tests for %s...', role)
+    remote.run(
+        args=[
+            run.Raw('CEPH_REF={ref}'.format(ref=ceph_ref)),
+            run.Raw('CEPH_ID="{id}"'.format(id=id_)),
+            run.Raw('PATH=$PATH:/usr/sbin'),
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            '{tdir}/virtualenv/bin/cram'.format(tdir=testdir),
+            '-v', '--',
+            run.Raw('{tdir}/archive/cram.{role}/*.t'.format(tdir=testdir, role=role)),
+            ],
+        logger=log.getChild(role),
+        )
diff --git a/qa/tasks/create_verify_lfn_objects.py b/qa/tasks/create_verify_lfn_objects.py
new file mode 100644
index 000000000..532541581
--- /dev/null
+++ b/qa/tasks/create_verify_lfn_objects.py
@@ -0,0 +1,83 @@
+"""
+Rados modle-based integration tests
+"""
+import contextlib
+import logging
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    For each combination of namespace and name_length, create
+    <num_objects> objects with name length <name_length>
+    on entry.  On exit, verify that the objects still exist, can
+    be deleted, and then don't exist.
+
+    Usage::
+
+       create_verify_lfn_objects.py:
+         pool: <pool_name> default: 'data'
+         prefix: <prefix> default: ''
+         namespace: [<namespace>] default: ['']
+         num_objects: [<num_objects>] default: 10
+         name_length: [<name_length>] default: [400]
+    """
+    pool = config.get('pool', 'data')
+    num_objects = config.get('num_objects', 10)
+    name_length = config.get('name_length', [400])
+    namespace = config.get('namespace', [None])
+    prefix = config.get('prefix', None)
+    manager = ctx.managers['ceph']
+
+    objects = []
+    for l in name_length:
+        for ns in namespace:
+            def object_name(i):
+                nslength = 0
+                if namespace != '':
+                    nslength = len(namespace)
+                numstr = str(i)
+                fillerlen = l - nslength - len(prefix) - len(numstr)
+                assert fillerlen >= 0
+                return prefix + ('a'*fillerlen) + numstr
+            objects += [(ns, object_name(i)) for i in  range(num_objects)]
+
+    for ns, name in objects:
+        err = manager.do_put(
+            pool,
+            name,
+            '/etc/resolv.conf',
+            namespace=ns)
+        log.info("err is " + str(err))
+        assert err == 0
+
+    try:
+        yield
+    finally:
+        log.info('ceph_verify_lfn_objects verifying...')
+        for ns, name in objects:
+            err = manager.do_get(
+                pool,
+                name,
+                namespace=ns)
+            log.info("err is " + str(err))
+            assert err == 0
+
+        log.info('ceph_verify_lfn_objects deleting...')
+        for ns, name in objects:
+            err = manager.do_rm(
+                pool,
+                name,
+                namespace=ns)
+            log.info("err is " + str(err))
+            assert err == 0
+
+        log.info('ceph_verify_lfn_objects verifying absent...')
+        for ns, name in objects:
+            err = manager.do_get(
+                pool,
+                name,
+                namespace=ns)
+            log.info("err is " + str(err))
+            assert err != 0
diff --git a/qa/tasks/daemonwatchdog.py b/qa/tasks/daemonwatchdog.py
new file mode 100644
index 000000000..c8fa9f3c2
--- /dev/null
+++ b/qa/tasks/daemonwatchdog.py
@@ -0,0 +1,128 @@
+import logging
+import signal
+import time
+
+from gevent import sleep
+from gevent.greenlet import Greenlet
+from gevent.event import Event
+
+log = logging.getLogger(__name__)
+
+class DaemonWatchdog(Greenlet):
+    """
+    DaemonWatchdog::
+
+    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
+    not intentional), then the watchdog will unmount file systems and send
+    SIGTERM to all daemons. The duration of an extended failure is configurable
+    with watchdog_daemon_timeout.
+
+    ceph:
+      watchdog:
+        daemon_restart [default: no]: restart daemon if "normal" exit (status==0).
+
+        daemon_timeout [default: 300]: number of seconds a daemon
+                                              is allowed to be failed before the
+                                              watchdog will bark.
+    """
+
+    def __init__(self, ctx, config, thrashers):
+        super(DaemonWatchdog, self).__init__()
+        self.config = ctx.config.get('watchdog', {})
+        self.ctx = ctx
+        self.e = None
+        self.logger = log.getChild('daemon_watchdog')
+        self.cluster = config.get('cluster', 'ceph')
+        self.name = 'watchdog'
+        self.stopping = Event()
+        self.thrashers = thrashers
+
+    def _run(self):
+        try:
+            self.watch()
+        except Exception as e:
+            # See _run exception comment for MDSThrasher
+            self.e = e
+            self.logger.exception("exception:")
+            # allow successful completion so gevent doesn't see an exception...
+
+    def log(self, x):
+        """Write data to logger"""
+        self.logger.info(x)
+
+    def stop(self):
+        self.stopping.set()
+
+    def bark(self):
+        self.log("BARK! unmounting mounts and killing all daemons")
+        if hasattr(self.ctx, 'mounts'):
+            for mount in self.ctx.mounts.values():
+                try:
+                    mount.umount_wait(force=True)
+                except:
+                    self.logger.exception("ignoring exception:")
+        daemons = []
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('osd', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('rgw', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)))
+
+        for daemon in daemons:
+            try:
+                daemon.signal(signal.SIGTERM)
+            except:
+                self.logger.exception("ignoring exception:")
+
+    def watch(self):
+        self.log("watchdog starting")
+        daemon_timeout = int(self.config.get('daemon_timeout', 300))
+        daemon_restart = self.config.get('daemon_restart', False)
+        daemon_failure_time = {}
+        while not self.stopping.is_set():
+            bark = False
+            now = time.time()
+
+            osds = self.ctx.daemons.iter_daemons_of_role('osd', cluster=self.cluster)
+            mons = self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.cluster)
+            mdss = self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.cluster)
+            rgws = self.ctx.daemons.iter_daemons_of_role('rgw', cluster=self.cluster)
+            mgrs = self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)
+
+            daemon_failures = []
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, osds))
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mons))
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mdss))
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, rgws))
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mgrs))
+
+            for daemon in daemon_failures:
+                name = daemon.role + '.' + daemon.id_
+                dt = daemon_failure_time.setdefault(name, (daemon, now))
+                assert dt[0] is daemon
+                delta = now-dt[1]
+                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+                if delta > daemon_timeout:
+                    bark = True
+                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+                    self.log(f"attempting to restart daemon {name}")
+                    daemon.restart()
+
+            # If a daemon is no longer failed, remove it from tracking:
+            for name in list(daemon_failure_time.keys()):
+                if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+                    self.log("daemon {name} has been restored".format(name=name))
+                    del daemon_failure_time[name]
+
+            for thrasher in self.thrashers:
+                if thrasher.exception is not None:
+                    self.log("{name} failed".format(name=thrasher.name))
+                    bark = True
+
+            if bark:
+                self.bark()
+                return
+
+            sleep(5)
+
+        self.log("watchdog finished")
diff --git a/qa/tasks/devstack.py b/qa/tasks/devstack.py
new file mode 100644
index 000000000..2499e9e53
--- /dev/null
+++ b/qa/tasks/devstack.py
@@ -0,0 +1,371 @@
+#!/usr/bin/env python
+import contextlib
+import logging
+import textwrap
+import time
+from configparser import ConfigParser
+from io import BytesIO, StringIO
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.contextutil import nested
+
+log = logging.getLogger(__name__)
+
+DEVSTACK_GIT_REPO = 'https://github.com/openstack-dev/devstack.git'
+DS_STABLE_BRANCHES = ("havana", "grizzly")
+
+is_devstack_node = lambda role: role.startswith('devstack')
+is_osd_node = lambda role: role.startswith('osd')
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    if config is None:
+        config = {}
+    if not isinstance(config, dict):
+        raise TypeError("config must be a dict")
+    with nested(lambda: install(ctx=ctx, config=config),
+                lambda: smoke(ctx=ctx, config=config),
+                ):
+        yield
+
+
+@contextlib.contextmanager
+def install(ctx, config):
+    """
+    Install OpenStack DevStack and configure it to use a Ceph cluster for
+    Glance and Cinder.
+
+    Requires one node with a role 'devstack'
+
+    Since devstack runs rampant on the system it's used on, typically you will
+    want to reprovision that machine after using devstack on it.
+
+    Also, the default 2GB of RAM that is given to vps nodes is insufficient. I
+    recommend 4GB. Downburst can be instructed to give 4GB to a vps node by
+    adding this to the yaml:
+
+    downburst:
+        ram: 4G
+
+    This was created using documentation found here:
+        https://github.com/openstack-dev/devstack/blob/master/README.md
+        http://docs.ceph.com/en/latest/rbd/rbd-openstack/
+    """
+    if config is None:
+        config = {}
+    if not isinstance(config, dict):
+        raise TypeError("config must be a dict")
+
+    devstack_node = next(iter(ctx.cluster.only(is_devstack_node).remotes.keys()))
+    an_osd_node = next(iter(ctx.cluster.only(is_osd_node).remotes.keys()))
+
+    devstack_branch = config.get("branch", "master")
+    install_devstack(devstack_node, devstack_branch)
+    try:
+        configure_devstack_and_ceph(ctx, config, devstack_node, an_osd_node)
+        yield
+    finally:
+        pass
+
+
+def install_devstack(devstack_node, branch="master"):
+    log.info("Cloning DevStack repo...")
+
+    args = ['git', 'clone', DEVSTACK_GIT_REPO]
+    devstack_node.run(args=args)
+
+    if branch != "master":
+        if branch in DS_STABLE_BRANCHES and not branch.startswith("stable"):
+            branch = "stable/" + branch
+        log.info("Checking out {branch} branch...".format(branch=branch))
+        cmd = "cd devstack && git checkout " + branch
+        devstack_node.run(args=cmd)
+
+    log.info("Installing DevStack...")
+    args = ['cd', 'devstack', run.Raw('&&'), './stack.sh']
+    devstack_node.run(args=args)
+
+
+def configure_devstack_and_ceph(ctx, config, devstack_node, ceph_node):
+    pool_size = config.get('pool_size', '128')
+    create_pools(ceph_node, pool_size)
+    distribute_ceph_conf(devstack_node, ceph_node)
+    # This is where we would install python-ceph and ceph-common but it appears
+    # the ceph task does that for us.
+    generate_ceph_keys(ceph_node)
+    distribute_ceph_keys(devstack_node, ceph_node)
+    secret_uuid = set_libvirt_secret(devstack_node, ceph_node)
+    update_devstack_config_files(devstack_node, secret_uuid)
+    set_apache_servername(devstack_node)
+    # Rebooting is the most-often-used method of restarting devstack services
+    misc.reboot(devstack_node)
+    start_devstack(devstack_node)
+    restart_apache(devstack_node)
+
+
+def create_pools(ceph_node, pool_size):
+    log.info("Creating pools on Ceph cluster...")
+
+    for pool_name in ['volumes', 'images', 'backups']:
+        args = ['sudo', 'ceph', 'osd', 'pool', 'create', pool_name, pool_size]
+        ceph_node.run(args=args)
+
+
+def distribute_ceph_conf(devstack_node, ceph_node):
+    log.info("Copying ceph.conf to DevStack node...")
+
+    ceph_conf_path = '/etc/ceph/ceph.conf'
+    ceph_conf = ceph_node.read_file(ceph_conf_path, sudo=True)
+    devstack_node.write_file(ceph_conf_path, ceph_conf, sudo=True)
+
+
+def generate_ceph_keys(ceph_node):
+    log.info("Generating Ceph keys...")
+
+    ceph_auth_cmds = [
+        ['sudo', 'ceph', 'auth', 'get-or-create', 'client.cinder', 'mon',
+            'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rx pool=images'],  # noqa
+        ['sudo', 'ceph', 'auth', 'get-or-create', 'client.glance', 'mon',
+            'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=images'],  # noqa
+        ['sudo', 'ceph', 'auth', 'get-or-create', 'client.cinder-backup', 'mon',
+            'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=backups'],  # noqa
+    ]
+    for cmd in ceph_auth_cmds:
+        ceph_node.run(args=cmd)
+
+
+def distribute_ceph_keys(devstack_node, ceph_node):
+    log.info("Copying Ceph keys to DevStack node...")
+
+    def copy_key(from_remote, key_name, to_remote, dest_path, owner):
+        key_stringio = BytesIO()
+        from_remote.run(
+            args=['sudo', 'ceph', 'auth', 'get-or-create', key_name],
+            stdout=key_stringio)
+        key_stringio.seek(0)
+        to_remote.write_file(dest_path, key_stringio, owner=owner, sudo=True)
+    keys = [
+        dict(name='client.glance',
+             path='/etc/ceph/ceph.client.glance.keyring',
+             # devstack appears to just want root:root
+             #owner='glance:glance',
+             ),
+        dict(name='client.cinder',
+             path='/etc/ceph/ceph.client.cinder.keyring',
+             # devstack appears to just want root:root
+             #owner='cinder:cinder',
+             ),
+        dict(name='client.cinder-backup',
+             path='/etc/ceph/ceph.client.cinder-backup.keyring',
+             # devstack appears to just want root:root
+             #owner='cinder:cinder',
+             ),
+    ]
+    for key_dict in keys:
+        copy_key(ceph_node, key_dict['name'], devstack_node,
+                 key_dict['path'], key_dict.get('owner'))
+
+
+def set_libvirt_secret(devstack_node, ceph_node):
+    log.info("Setting libvirt secret...")
+
+    cinder_key = ceph_node.sh('sudo ceph auth get-key client.cinder').strip()
+    uuid = devstack_node.sh('uuidgen').strip()
+
+    secret_path = '/tmp/secret.xml'
+    secret_template = textwrap.dedent("""
+    <secret ephemeral='no' private='no'>
+        <uuid>{uuid}</uuid>
+        <usage type='ceph'>
+            <name>client.cinder secret</name>
+        </usage>
+    </secret>""")
+    secret_data = secret_template.format(uuid=uuid)
+    devstack_node.write_file(secret_path, secret_data)
+    devstack_node.run(args=['sudo', 'virsh', 'secret-define', '--file',
+                            secret_path])
+    devstack_node.run(args=['sudo', 'virsh', 'secret-set-value', '--secret',
+                            uuid, '--base64', cinder_key])
+    return uuid
+
+
+def update_devstack_config_files(devstack_node, secret_uuid):
+    log.info("Updating DevStack config files to use Ceph...")
+
+    def backup_config(node, file_name, backup_ext='.orig.teuth'):
+        node.run(args=['cp', '-f', file_name, file_name + backup_ext])
+
+    def update_config(config_name, config_stream, update_dict,
+                      section='DEFAULT'):
+        parser = ConfigParser()
+        parser.read_file(config_stream)
+        for (key, value) in update_dict.items():
+            parser.set(section, key, value)
+        out_stream = StringIO()
+        parser.write(out_stream)
+        out_stream.seek(0)
+        return out_stream
+
+    updates = [
+        dict(name='/etc/glance/glance-api.conf', options=dict(
+            default_store='rbd',
+            rbd_store_user='glance',
+            rbd_store_pool='images',
+            show_image_direct_url='True',)),
+        dict(name='/etc/cinder/cinder.conf', options=dict(
+            volume_driver='cinder.volume.drivers.rbd.RBDDriver',
+            rbd_pool='volumes',
+            rbd_ceph_conf='/etc/ceph/ceph.conf',
+            rbd_flatten_volume_from_snapshot='false',
+            rbd_max_clone_depth='5',
+            glance_api_version='2',
+            rbd_user='cinder',
+            rbd_secret_uuid=secret_uuid,
+            backup_driver='cinder.backup.drivers.ceph',
+            backup_ceph_conf='/etc/ceph/ceph.conf',
+            backup_ceph_user='cinder-backup',
+            backup_ceph_chunk_size='134217728',
+            backup_ceph_pool='backups',
+            backup_ceph_stripe_unit='0',
+            backup_ceph_stripe_count='0',
+            restore_discard_excess_bytes='true',
+            )),
+        dict(name='/etc/nova/nova.conf', options=dict(
+            libvirt_images_type='rbd',
+            libvirt_images_rbd_pool='volumes',
+            libvirt_images_rbd_ceph_conf='/etc/ceph/ceph.conf',
+            rbd_user='cinder',
+            rbd_secret_uuid=secret_uuid,
+            libvirt_inject_password='false',
+            libvirt_inject_key='false',
+            libvirt_inject_partition='-2',
+            )),
+    ]
+
+    for update in updates:
+        file_name = update['name']
+        options = update['options']
+        config_data = devstack_node.read_file(file_name, sudo=True)
+        config_stream = StringIO(config_data)
+        backup_config(devstack_node, file_name)
+        new_config_stream = update_config(file_name, config_stream, options)
+        devstack_node.write_file(file_name, new_config_stream, sudo=True)
+
+
+def set_apache_servername(node):
+    # Apache complains: "Could not reliably determine the server's fully
+    # qualified domain name, using 127.0.0.1 for ServerName"
+    # So, let's make sure it knows its name.
+    log.info("Setting Apache ServerName...")
+
+    hostname = node.hostname
+    config_file = '/etc/apache2/conf.d/servername'
+    config_data = "ServerName {name}".format(name=hostname)
+    node.write_file(config_file, config_data, sudo=True)
+
+
+def start_devstack(devstack_node):
+    log.info("Patching devstack start script...")
+    # This causes screen to start headless - otherwise rejoin-stack.sh fails
+    # because there is no terminal attached.
+    cmd = "cd devstack && sed -ie 's/screen -c/screen -dm -c/' rejoin-stack.sh"
+    devstack_node.run(args=cmd)
+
+    log.info("Starting devstack...")
+    cmd = "cd devstack && ./rejoin-stack.sh"
+    devstack_node.run(args=cmd)
+
+    # This was added because I was getting timeouts on Cinder requests - which
+    # were trying to access Keystone on port 5000. A more robust way to handle
+    # this would be to introduce a wait-loop on devstack_node that checks to
+    # see if a service is listening on port 5000.
+    log.info("Waiting 30s for devstack to start...")
+    time.sleep(30)
+
+
+def restart_apache(node):
+    node.run(args=['sudo', '/etc/init.d/apache2', 'restart'], wait=True)
+
+
+@contextlib.contextmanager
+def exercise(ctx, config):
+    log.info("Running devstack exercises...")
+
+    if config is None:
+        config = {}
+    if not isinstance(config, dict):
+        raise TypeError("config must be a dict")
+
+    devstack_node = next(iter(ctx.cluster.only(is_devstack_node).remotes.keys()))
+
+    # TODO: save the log *and* preserve failures
+    #devstack_archive_dir = create_devstack_archive(ctx, devstack_node)
+
+    try:
+        #cmd = "cd devstack && ./exercise.sh 2>&1 | tee {dir}/exercise.log".format(  # noqa
+        #    dir=devstack_archive_dir)
+        cmd = "cd devstack && ./exercise.sh"
+        devstack_node.run(args=cmd, wait=True)
+        yield
+    finally:
+        pass
+
+
+def create_devstack_archive(ctx, devstack_node):
+    test_dir = misc.get_testdir(ctx)
+    devstack_archive_dir = "{test_dir}/archive/devstack".format(
+        test_dir=test_dir)
+    devstack_node.run(args="mkdir -p " + devstack_archive_dir)
+    return devstack_archive_dir
+
+
+@contextlib.contextmanager
+def smoke(ctx, config):
+    log.info("Running a basic smoketest...")
+
+    devstack_node = next(iter(ctx.cluster.only(is_devstack_node).remotes.keys()))
+    an_osd_node = next(iter(ctx.cluster.only(is_osd_node).remotes.keys()))
+
+    try:
+        create_volume(devstack_node, an_osd_node, 'smoke0', 1)
+        yield
+    finally:
+        pass
+
+
+def create_volume(devstack_node, ceph_node, vol_name, size):
+    """
+    :param size: The size of the volume, in GB
+    """
+    size = str(size)
+    log.info("Creating a {size}GB volume named {name}...".format(
+        name=vol_name,
+        size=size))
+    args = ['source', 'devstack/openrc', run.Raw('&&'), 'cinder', 'create',
+            '--display-name', vol_name, size]
+    cinder_create = devstack_node.sh(args, wait=True)
+    vol_info = parse_os_table(cinder_create)
+    log.debug("Volume info: %s", str(vol_info))
+
+    try:
+        rbd_output = ceph_node.sh("rbd --id cinder ls -l volumes", wait=True)
+    except run.CommandFailedError:
+        log.debug("Original rbd call failed; retrying without '--id cinder'")
+        rbd_output = ceph_node.sh("rbd ls -l volumes", wait=True)
+
+    assert vol_info['id'] in rbd_output, \
+        "Volume not found on Ceph cluster"
+    assert vol_info['size'] == size, \
+        "Volume size on Ceph cluster is different than specified"
+    return vol_info['id']
+
+
+def parse_os_table(table_str):
+    out_dict = dict()
+    for line in table_str.split('\n'):
+        if line.startswith('|'):
+            items = line.split()
+            out_dict[items[1]] = items[3]
+    return out_dict
diff --git a/qa/tasks/die_on_err.py b/qa/tasks/die_on_err.py
new file mode 100644
index 000000000..a6aa4c632
--- /dev/null
+++ b/qa/tasks/die_on_err.py
@@ -0,0 +1,70 @@
+"""
+Raise exceptions on osd coredumps or test err directories
+"""
+import contextlib
+import logging
+import time
+from teuthology.orchestra import run
+
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Die if {testdir}/err exists or if an OSD dumps core
+    """
+    if config is None:
+        config = {}
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < num_osds:
+        time.sleep(10)
+
+    testdir = teuthology.get_testdir(ctx)
+
+    while True:
+        for i in range(num_osds):
+            (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.keys()
+            p = osd_remote.run(
+                args = [ 'test', '-e', '{tdir}/err'.format(tdir=testdir) ],
+                wait=True,
+                check_status=False,
+            )
+            exit_status = p.exitstatus
+
+            if exit_status == 0:
+                log.info("osd %d has an error" % i)
+                raise Exception("osd %d error" % i)
+
+            log_path = '/var/log/ceph/osd.%d.log' % (i)
+
+            p = osd_remote.run(
+                args = [
+                         'tail', '-1', log_path,
+                         run.Raw('|'),
+                         'grep', '-q', 'end dump'
+                       ],
+                wait=True,
+                check_status=False,
+            )
+            exit_status = p.exitstatus
+
+            if exit_status == 0:
+                log.info("osd %d dumped core" % i)
+                raise Exception("osd %d dumped core" % i)
+
+        time.sleep(5)
diff --git a/qa/tasks/divergent_priors.py b/qa/tasks/divergent_priors.py
new file mode 100644
index 000000000..e000bb2bb
--- /dev/null
+++ b/qa/tasks/divergent_priors.py
@@ -0,0 +1,160 @@
+"""
+Special case divergence test
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+from tasks.util.rados import rados
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Test handling of divergent entries with prior_version
+    prior to log_tail
+
+    overrides:
+      ceph:
+        conf:
+          osd:
+            debug osd: 5
+
+    Requires 3 osds on a single test node.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'divergent_priors task only accepts a dict for configuration'
+
+    manager = ctx.managers['ceph']
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'noin')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+    dummyfile2 = '/etc/resolv.conf'
+
+    # create 1 pg pool
+    log.info('creating foo')
+    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+
+    osds = [0, 1, 2]
+    for i in osds:
+        manager.set_config(i, osd_min_pg_log_entries=10)
+        manager.set_config(i, osd_max_pg_log_entries=10)
+        manager.set_config(i, osd_pg_log_trim_min=5)
+
+    # determine primary
+    divergent = manager.get_pg_primary('foo', 0)
+    log.info("primary and soon to be divergent is %d", divergent)
+    non_divergent = list(osds)
+    non_divergent.remove(divergent)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+    # write 100 objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+    manager.wait_for_clean()
+
+    # blackhole non_divergent
+    log.info("blackholing osds %s", str(non_divergent))
+    for i in non_divergent:
+        manager.set_config(i, objectstore_blackhole=1)
+
+    DIVERGENT_WRITE = 5
+    DIVERGENT_REMOVE = 5
+    # Write some soon to be divergent
+    log.info('writing divergent objects')
+    for i in range(DIVERGENT_WRITE):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+                         dummyfile2], wait=False)
+    # Remove some soon to be divergent
+    log.info('remove divergent objects')
+    for i in range(DIVERGENT_REMOVE):
+        rados(ctx, mon, ['-p', 'foo', 'rm',
+                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+    time.sleep(10)
+    mon.run(
+        args=['killall', '-9', 'rados'],
+        wait=True,
+        check_status=False)
+
+    # kill all the osds but leave divergent in
+    log.info('killing all the osds')
+    for i in osds:
+        manager.kill_osd(i)
+    for i in osds:
+        manager.mark_down_osd(i)
+    for i in non_divergent:
+        manager.mark_out_osd(i)
+
+    # bring up non-divergent
+    log.info("bringing up non_divergent %s", str(non_divergent))
+    for i in non_divergent:
+        manager.revive_osd(i)
+    for i in non_divergent:
+        manager.mark_in_osd(i)
+
+    # write 1 non-divergent object (ensure that old divergent one is divergent)
+    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+    log.info('writing non-divergent object ' + objname)
+    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+    manager.wait_for_recovery()
+
+    # ensure no recovery of up osds first
+    log.info('delay recovery')
+    for i in non_divergent:
+        manager.wait_run_admin_socket(
+            'osd', i, ['set_recovery_delay', '100000'])
+
+    # bring in our divergent friend
+    log.info("revive divergent %d", divergent)
+    manager.raw_cluster_cmd('osd', 'set', 'noup')
+    manager.revive_osd(divergent)
+
+    log.info('delay recovery divergent')
+    manager.wait_run_admin_socket(
+        'osd', divergent, ['set_recovery_delay', '100000'])
+
+    manager.raw_cluster_cmd('osd', 'unset', 'noup')
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    log.info('wait for peering')
+    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+    # At this point the divergent_priors should have been detected
+
+    log.info("killing divergent %d", divergent)
+    manager.kill_osd(divergent)
+    log.info("reviving divergent %d", divergent)
+    manager.revive_osd(divergent)
+
+    time.sleep(20)
+
+    log.info('allowing recovery')
+    # Set osd_recovery_delay_start back to 0 and kick the queue
+    for i in osds:
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+                                    'kick_recovery_wq', ' 0')
+
+    log.info('reading divergent objects')
+    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+                                       '/tmp/existing'])
+        assert exit_status == 0
+
+    log.info("success")
diff --git a/qa/tasks/divergent_priors2.py b/qa/tasks/divergent_priors2.py
new file mode 100644
index 000000000..4d4b07fc4
--- /dev/null
+++ b/qa/tasks/divergent_priors2.py
@@ -0,0 +1,192 @@
+"""
+Special case divergence test with ceph-objectstore-tool export/remove/import
+"""
+import logging
+import time
+
+from teuthology.exceptions import CommandFailedError
+from teuthology import misc as teuthology
+from tasks.util.rados import rados
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Test handling of divergent entries with prior_version
+    prior to log_tail and a ceph-objectstore-tool export/import
+
+    overrides:
+      ceph:
+        conf:
+          osd:
+            debug osd: 5
+
+    Requires 3 osds on a single test node.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'divergent_priors task only accepts a dict for configuration'
+
+    manager = ctx.managers['ceph']
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'noin')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+    dummyfile2 = '/etc/resolv.conf'
+    testdir = teuthology.get_testdir(ctx)
+
+    # create 1 pg pool
+    log.info('creating foo')
+    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+
+    osds = [0, 1, 2]
+    for i in osds:
+        manager.set_config(i, osd_min_pg_log_entries=10)
+        manager.set_config(i, osd_max_pg_log_entries=10)
+        manager.set_config(i, osd_pg_log_trim_min=5)
+
+    # determine primary
+    divergent = manager.get_pg_primary('foo', 0)
+    log.info("primary and soon to be divergent is %d", divergent)
+    non_divergent = list(osds)
+    non_divergent.remove(divergent)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+    # write 100 objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+    manager.wait_for_clean()
+
+    # blackhole non_divergent
+    log.info("blackholing osds %s", str(non_divergent))
+    for i in non_divergent:
+        manager.set_config(i, objectstore_blackhole=1)
+
+    DIVERGENT_WRITE = 5
+    DIVERGENT_REMOVE = 5
+    # Write some soon to be divergent
+    log.info('writing divergent objects')
+    for i in range(DIVERGENT_WRITE):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+                         dummyfile2], wait=False)
+    # Remove some soon to be divergent
+    log.info('remove divergent objects')
+    for i in range(DIVERGENT_REMOVE):
+        rados(ctx, mon, ['-p', 'foo', 'rm',
+                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+    time.sleep(10)
+    mon.run(
+        args=['killall', '-9', 'rados'],
+        wait=True,
+        check_status=False)
+
+    # kill all the osds but leave divergent in
+    log.info('killing all the osds')
+    for i in osds:
+        manager.kill_osd(i)
+    for i in osds:
+        manager.mark_down_osd(i)
+    for i in non_divergent:
+        manager.mark_out_osd(i)
+
+    # bring up non-divergent
+    log.info("bringing up non_divergent %s", str(non_divergent))
+    for i in non_divergent:
+        manager.revive_osd(i)
+    for i in non_divergent:
+        manager.mark_in_osd(i)
+
+    # write 1 non-divergent object (ensure that old divergent one is divergent)
+    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+    log.info('writing non-divergent object ' + objname)
+    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+    manager.wait_for_recovery()
+
+    # ensure no recovery of up osds first
+    log.info('delay recovery')
+    for i in non_divergent:
+        manager.wait_run_admin_socket(
+            'osd', i, ['set_recovery_delay', '100000'])
+
+    # bring in our divergent friend
+    log.info("revive divergent %d", divergent)
+    manager.raw_cluster_cmd('osd', 'set', 'noup')
+    manager.revive_osd(divergent)
+
+    log.info('delay recovery divergent')
+    manager.wait_run_admin_socket(
+        'osd', divergent, ['set_recovery_delay', '100000'])
+
+    manager.raw_cluster_cmd('osd', 'unset', 'noup')
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    log.info('wait for peering')
+    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+    # At this point the divergent_priors should have been detected
+
+    log.info("killing divergent %d", divergent)
+    manager.kill_osd(divergent)
+
+    # Export a pg
+    (exp_remote,) = ctx.\
+        cluster.only('osd.{o}'.format(o=divergent)).remotes.keys()
+    FSPATH = manager.get_filepath()
+    JPATH = os.path.join(FSPATH, "journal")
+    prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+              "--data-path {fpath} --journal-path {jpath} "
+              "--log-file="
+              "/var/log/ceph/objectstore_tool.$$.log ".
+              format(fpath=FSPATH, jpath=JPATH))
+    pid = os.getpid()
+    expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
+    cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
+           format(id=divergent, file=expfile))
+    try:
+        exp_remote.sh(cmd, wait=True)
+    except CommandFailedError as e:
+        assert e.exitstatus == 0
+
+    cmd = ((prefix + "--op import --file {file}").
+           format(id=divergent, file=expfile))
+    try:
+        exp_remote.sh(cmd, wait=True)
+    except CommandFailedError as e:
+        assert e.exitstatus == 0
+
+    log.info("reviving divergent %d", divergent)
+    manager.revive_osd(divergent)
+    manager.wait_run_admin_socket('osd', divergent, ['dump_ops_in_flight'])
+    time.sleep(20);
+
+    log.info('allowing recovery')
+    # Set osd_recovery_delay_start back to 0 and kick the queue
+    for i in osds:
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+                                    'kick_recovery_wq', ' 0')
+
+    log.info('reading divergent objects')
+    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+                                       '/tmp/existing'])
+        assert exit_status == 0
+
+    cmd = 'rm {file}'.format(file=expfile)
+    exp_remote.run(args=cmd, wait=True)
+    log.info("success")
diff --git a/qa/tasks/dnsmasq.py b/qa/tasks/dnsmasq.py
new file mode 100644
index 000000000..df8ccecb1
--- /dev/null
+++ b/qa/tasks/dnsmasq.py
@@ -0,0 +1,170 @@
+"""
+Task for dnsmasq configuration
+"""
+import contextlib
+import logging
+
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology import contextutil
+from teuthology import packaging
+from tasks.util import get_remote_for_role
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def install_dnsmasq(remote):
+    """
+    If dnsmasq is not installed, install it for the duration of the task.
+    """
+    try:
+        existing = packaging.get_package_version(remote, 'dnsmasq')
+    except:
+        existing = None
+
+    if existing is None:
+        packaging.install_package('dnsmasq', remote)
+    try:
+        yield
+    finally:
+        if existing is None:
+            packaging.remove_package('dnsmasq', remote)
+
+@contextlib.contextmanager
+def backup_resolv(remote, path):
+    """
+    Store a backup of resolv.conf in the testdir and restore it after the task.
+    """
+    remote.run(args=['cp', '/etc/resolv.conf', path])
+    try:
+        yield
+    finally:
+        # restore with 'cp' to avoid overwriting its security context
+        remote.run(args=['sudo', 'cp', path, '/etc/resolv.conf'])
+        remote.run(args=['rm', path])
+
+@contextlib.contextmanager
+def replace_resolv(remote, path):
+    """
+    Update resolv.conf to point the nameserver at localhost.
+    """
+    remote.write_file(path, "nameserver 127.0.0.1\n")
+    try:
+        # install it
+        if remote.os.package_type == "rpm":
+            # for centos ovh resolv.conf has immutable attribute set
+            remote.run(args=['sudo', 'chattr', '-i', '/etc/resolv.conf'], check_status=False)
+        remote.run(args=['sudo', 'cp', path, '/etc/resolv.conf'])
+        yield
+    finally:
+        remote.run(args=['rm', path])
+
+@contextlib.contextmanager
+def setup_dnsmasq(remote, testdir, cnames):
+    """ configure dnsmasq on the given remote, adding each cname given """
+    log.info('Configuring dnsmasq on remote %s..', remote.name)
+
+    # add address entries for each cname
+    dnsmasq = "server=8.8.8.8\nserver=8.8.4.4\n"
+    address_template = "address=/{cname}/{ip_address}\n"
+    for cname, ip_address in cnames.items():
+        dnsmasq += address_template.format(cname=cname, ip_address=ip_address)
+
+    # write to temporary dnsmasq file
+    dnsmasq_tmp = '/'.join((testdir, 'ceph.tmp'))
+    remote.write_file(dnsmasq_tmp, dnsmasq)
+
+    # move into /etc/dnsmasq.d/
+    dnsmasq_path = '/etc/dnsmasq.d/ceph'
+    remote.run(args=['sudo', 'mv', dnsmasq_tmp, dnsmasq_path])
+    # restore selinux context if necessary
+    remote.run(args=['sudo', 'restorecon', dnsmasq_path], check_status=False)
+
+    # restart dnsmasq
+    remote.run(args=['sudo', 'systemctl', 'restart', 'dnsmasq'])
+    # verify dns name is set
+    remote.run(args=['ping', '-c', '4', next(iter(cnames.keys()))])
+
+    try:
+        yield
+    finally:
+        log.info('Removing dnsmasq configuration from remote %s..', remote.name)
+        # remove /etc/dnsmasq.d/ceph
+        remote.run(args=['sudo', 'rm', dnsmasq_path])
+        # restart dnsmasq
+        remote.run(args=['sudo', 'systemctl', 'restart', 'dnsmasq'])
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Configures dnsmasq to add cnames for teuthology remotes. The task expects a
+    dictionary, where each key is a role. If all cnames for that role use the
+    same address as that role, the cnames can be given as a list. For example,
+    this entry configures dnsmasq on the remote associated with client.0, adding
+    two cnames for the ip address associated with client.0:
+
+        - dnsmasq:
+            client.0:
+            - client0.example.com
+            - c0.example.com
+
+    If the addresses do not all match the given role, a dictionary can be given
+    to specify the ip address by its target role. For example:
+
+        - dnsmasq:
+            client.0:
+              client.0.example.com: client.0
+              client.1.example.com: client.1
+
+    Cnames that end with a . are treated as prefix for the existing hostname.
+    For example, if the remote for client.0 has a hostname of 'example.com',
+    this task will add cnames for dev.example.com and test.example.com:
+
+        - dnsmasq:
+            client.0: [dev., test.]
+    """
+    # apply overrides
+    overrides = config.get('overrides', {})
+    misc.deep_merge(config, overrides.get('dnsmasq', {}))
+
+    # multiple roles may map to the same remote, so collect names by remote
+    remote_names = {}
+    for role, cnames in config.items():
+        remote = get_remote_for_role(ctx, role)
+        if remote is None:
+            raise ConfigError('no remote for role %s' % role)
+
+        names = remote_names.get(remote, {})
+
+        if isinstance(cnames, list):
+            # when given a list of cnames, point to local ip
+            for cname in cnames:
+                if cname.endswith('.'):
+                    cname += remote.hostname
+                names[cname] = remote.ip_address
+        elif isinstance(cnames, dict):
+            # when given a dict, look up the remote ip for each
+            for cname, client in cnames.items():
+                r = get_remote_for_role(ctx, client)
+                if r is None:
+                    raise ConfigError('no remote for role %s' % client)
+                if cname.endswith('.'):
+                    cname += r.hostname
+                names[cname] = r.ip_address
+
+        remote_names[remote] = names
+
+    testdir = misc.get_testdir(ctx)
+    resolv_bak = '/'.join((testdir, 'resolv.bak'))
+    resolv_tmp = '/'.join((testdir, 'resolv.tmp'))
+
+    # run subtasks for each unique remote
+    subtasks = []
+    for remote, cnames in remote_names.items():
+        subtasks.extend([ lambda r=remote: install_dnsmasq(r) ])
+        subtasks.extend([ lambda r=remote: backup_resolv(r, resolv_bak) ])
+        subtasks.extend([ lambda r=remote: replace_resolv(r, resolv_tmp) ])
+        subtasks.extend([ lambda r=remote, cn=cnames: setup_dnsmasq(r, testdir, cn) ])
+
+    with contextutil.nested(*subtasks):
+        yield
diff --git a/qa/tasks/dump_stuck.py b/qa/tasks/dump_stuck.py
new file mode 100644
index 000000000..4971f1916
--- /dev/null
+++ b/qa/tasks/dump_stuck.py
@@ -0,0 +1,161 @@
+"""
+Dump_stuck command
+"""
+import logging
+import time
+
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
+    """
+    Do checks.  Make sure get_stuck_pgs return the right amount of information, then
+    extract health information from the raw_cluster_cmd and compare the results with
+    values passed in.  This passes if all asserts pass.
+ 
+    :param num_manager: Ceph manager
+    :param num_inactive: number of inaactive pages that are stuck
+    :param num_unclean: number of unclean pages that are stuck
+    :param num_stale: number of stale pages that are stuck
+    :param timeout: timeout value for get_stuck_pgs calls
+    """
+    inactive = manager.get_stuck_pgs('inactive', timeout)
+    unclean = manager.get_stuck_pgs('unclean', timeout)
+    stale = manager.get_stuck_pgs('stale', timeout)
+    log.info('inactive %s / %d,  unclean %s / %d,  stale %s / %d',
+             len(inactive), num_inactive,
+             len(unclean), num_unclean,
+             len(stale), num_stale)
+    assert len(inactive) == num_inactive
+    assert len(unclean) == num_unclean
+    assert len(stale) == num_stale
+
+def task(ctx, config):
+    """
+    Test the dump_stuck command.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert config is None, \
+        'dump_stuck requires no configuration'
+    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
+        'dump_stuck requires exactly 2 osds'
+
+    timeout = 60
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_clean(timeout)
+
+    manager.raw_cluster_cmd('tell', 'mon.a', 'injectargs', '--',
+#                            '--mon-osd-report-timeout 90',
+                            '--mon-pg-stuck-threshold 10')
+
+    # all active+clean
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
+    num_pgs = manager.get_num_pgs()
+
+    manager.mark_out_osd(0)
+    time.sleep(timeout)
+    manager.flush_pg_stats([1])
+    manager.wait_for_recovery(timeout)
+
+    # all active+clean+remapped
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
+
+    manager.mark_in_osd(0)
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_clean(timeout)
+
+    # all active+clean
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
+
+    log.info('stopping first osd')
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+    manager.wait_for_active(timeout)
+
+    log.info('waiting for all to be unclean')
+    starttime = time.time()
+    done = False
+    while not done:
+        try:
+            check_stuck(
+                manager,
+                num_inactive=0,
+                num_unclean=num_pgs,
+                num_stale=0,
+                )
+            done = True
+        except AssertionError:
+            # wait up to 15 minutes to become stale
+            if time.time() - starttime > 900:
+                raise
+
+
+    log.info('stopping second osd')
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+
+    log.info('waiting for all to be stale')
+    starttime = time.time()
+    done = False
+    while not done:
+        try:
+            check_stuck(
+                manager,
+                num_inactive=0,
+                num_unclean=num_pgs,
+                num_stale=num_pgs,
+                )
+            done = True
+        except AssertionError:
+            # wait up to 15 minutes to become stale
+            if time.time() - starttime > 900:
+                raise
+
+    log.info('reviving')
+    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
+        manager.revive_osd(id_)
+        manager.mark_in_osd(id_)
+    while True:
+        try:
+            manager.flush_pg_stats([0, 1])
+            break
+        except Exception:
+            log.exception('osds must not be started yet, waiting...')
+            time.sleep(1)
+    manager.wait_for_clean(timeout)
+
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
diff --git a/qa/tasks/ec_inconsistent_hinfo.py b/qa/tasks/ec_inconsistent_hinfo.py
new file mode 100644
index 000000000..fa10f2c45
--- /dev/null
+++ b/qa/tasks/ec_inconsistent_hinfo.py
@@ -0,0 +1,225 @@
+"""
+Inconsistent_hinfo
+"""
+import logging
+import time
+from dateutil.parser import parse
+from tasks import ceph_manager
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def wait_for_deep_scrub_complete(manager, pgid, check_time_now, inconsistent):
+    log.debug("waiting for pg %s deep-scrub complete (check_time_now=%s)" %
+              (pgid, check_time_now))
+    for i in range(300):
+        time.sleep(5)
+        manager.flush_pg_stats([0, 1, 2, 3])
+        pgs = manager.get_pg_stats()
+        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+        log.debug('pg=%s' % pg);
+        assert pg
+
+        last_deep_scrub_time = parse(pg['last_deep_scrub_stamp']).strftime('%s')
+        if last_deep_scrub_time < check_time_now:
+            log.debug('not scrubbed')
+            continue
+
+        status = pg['state'].split('+')
+        if inconsistent:
+            assert 'inconsistent' in status
+        else:
+            assert 'inconsistent' not in status
+        return
+
+    assert False, 'not scrubbed'
+
+
+def wait_for_backfilling_complete(manager, pgid, from_osd, to_osd):
+    log.debug("waiting for pg %s backfill from osd.%s to osd.%s complete" %
+              (pgid, from_osd, to_osd))
+    for i in range(300):
+        time.sleep(5)
+        manager.flush_pg_stats([0, 1, 2, 3])
+        pgs = manager.get_pg_stats()
+        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+        log.info('pg=%s' % pg);
+        assert pg
+        status = pg['state'].split('+')
+        if 'active' not in status:
+            log.debug('not active')
+            continue
+        if 'backfilling' in status:
+            assert from_osd in pg['acting'] and to_osd in pg['up']
+            log.debug('backfilling')
+            continue
+        if to_osd not in pg['up']:
+            log.debug('backfill not started yet')
+            continue
+        log.debug('backfilled!')
+        break
+
+def task(ctx, config):
+    """
+    Test handling of objects with inconsistent hash info during backfill and deep-scrub.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'ec_inconsistent_hinfo task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    profile = config.get('erasure_code_profile', {
+        'k': '2',
+        'm': '1',
+        'crush-failure-domain': 'osd'
+    })
+    profile_name = profile.get('name', 'backfill_unfound')
+    manager.create_erasure_code_profile(profile_name, profile)
+    pool = manager.create_pool_with_unique_name(
+        pg_num=1,
+        erasure_code_profile_name=profile_name,
+        min_size=2)
+    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
+                            'pg_autoscale_mode', 'off')
+
+    manager.flush_pg_stats([0, 1, 2, 3])
+    manager.wait_for_clean()
+
+    pool_id = manager.get_pool_num(pool)
+    pgid = '%d.0' % pool_id
+    pgs = manager.get_pg_stats()
+    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
+    log.info("acting=%s" % acting)
+    assert acting
+    primary = acting[0]
+
+    # something that is always there, readable and never empty
+    dummyfile = '/etc/group'
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    log.debug("create test object")
+    obj = 'test'
+    rados(ctx, mon, ['-p', pool, 'put', obj, dummyfile])
+
+    victim = acting[1]
+
+    log.info("remove test object hash info from osd.%s shard and test deep-scrub and repair"
+             % victim)
+
+    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+                             object_name=obj, osd=victim)
+    check_time_now = time.strftime('%s')
+    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
+    wait_for_deep_scrub_complete(manager, pgid, check_time_now, True)
+
+    check_time_now = time.strftime('%s')
+    manager.raw_cluster_cmd('pg', 'repair', pgid)
+    wait_for_deep_scrub_complete(manager, pgid, check_time_now, False)
+
+    log.info("remove test object hash info from primary osd.%s shard and test backfill"
+             % primary)
+
+    log.debug("write some data")
+    rados(ctx, mon, ['-p', pool, 'bench', '30', 'write', '-b', '4096',
+                     '--no-cleanup'])
+
+    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+                             object_name=obj, osd=primary)
+
+    # mark the osd out to trigger a rebalance/backfill
+    source = acting[1]
+    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
+    manager.mark_out_osd(source)
+
+    # wait for everything to peer, backfill and recover
+    wait_for_backfilling_complete(manager, pgid, source, target)
+    manager.wait_for_clean()
+
+    manager.flush_pg_stats([0, 1, 2, 3])
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug('pg=%s' % pg)
+    assert pg
+    assert 'clean' in pg['state'].split('+')
+    assert 'inconsistent' not in pg['state'].split('+')
+    unfound = manager.get_num_unfound_objects()
+    log.debug("there are %d unfound objects" % unfound)
+    assert unfound == 0
+
+    source, target = target, source
+    log.info("remove test object hash info from non-primary osd.%s shard and test backfill"
+             % source)
+
+    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+                             object_name=obj, osd=source)
+
+    # mark the osd in to trigger a rebalance/backfill
+    manager.mark_in_osd(target)
+
+    # wait for everything to peer, backfill and recover
+    wait_for_backfilling_complete(manager, pgid, source, target)
+    manager.wait_for_clean()
+
+    manager.flush_pg_stats([0, 1, 2, 3])
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug('pg=%s' % pg)
+    assert pg
+    assert 'clean' in pg['state'].split('+')
+    assert 'inconsistent' not in pg['state'].split('+')
+    unfound = manager.get_num_unfound_objects()
+    log.debug("there are %d unfound objects" % unfound)
+    assert unfound == 0
+
+    log.info("remove hash info from two shards and test backfill")
+
+    source = acting[2]
+    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
+    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+                             object_name=obj, osd=primary)
+    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+                             object_name=obj, osd=source)
+
+    # mark the osd out to trigger a rebalance/backfill
+    manager.mark_out_osd(source)
+
+    # wait for everything to peer, backfill and detect unfound object
+    wait_for_backfilling_complete(manager, pgid, source, target)
+
+    # verify that there is unfound object
+    manager.flush_pg_stats([0, 1, 2, 3])
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug('pg=%s' % pg)
+    assert pg
+    assert 'backfill_unfound' in pg['state'].split('+')
+    unfound = manager.get_num_unfound_objects()
+    log.debug("there are %d unfound objects" % unfound)
+    assert unfound == 1
+    m = manager.list_pg_unfound(pgid)
+    log.debug('list_pg_unfound=%s' % m)
+    assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    manager.raw_cluster_cmd('pg', pgid, 'mark_unfound_lost', 'delete')
+
+    # wait for everything to peer and be happy...
+    manager.flush_pg_stats([0, 1, 2, 3])
+    manager.wait_for_recovery()
diff --git a/qa/tasks/ec_lost_unfound.py b/qa/tasks/ec_lost_unfound.py
new file mode 100644
index 000000000..57a9364ec
--- /dev/null
+++ b/qa/tasks/ec_lost_unfound.py
@@ -0,0 +1,159 @@
+"""
+Lost_unfound
+"""
+import logging
+import time
+from tasks import ceph_manager
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of lost objects on an ec pool.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    manager.wait_for_clean()
+
+    profile = config.get('erasure_code_profile', {
+        'k': '2',
+        'm': '2',
+        'crush-failure-domain': 'osd'
+    })
+    profile_name = profile.get('name', 'lost_unfound')
+    manager.create_erasure_code_profile(profile_name, profile)
+    pool = manager.create_pool_with_unique_name(
+        erasure_code_profile_name=profile_name,
+        min_size=2)
+
+    # something that is always there, readable and never empty
+    dummyfile = '/etc/group'
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f])
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.1',
+            'injectargs',
+            '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+            )
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+    manager.kill_osd(3)
+    manager.mark_down_osd(3)
+    
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
+
+    # take out osd.1 and a necessary shard of those objects.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+    manager.revive_osd(0)
+    manager.wait_till_osd_is_up(0)
+    manager.revive_osd(3)
+    manager.wait_till_osd_is_up(3)
+
+    manager.flush_pg_stats([0, 2, 3])
+    manager.wait_till_active()
+    manager.flush_pg_stats([0, 2, 3])
+
+    # verify that there are unfound objects
+    unfound = manager.get_num_unfound_objects()
+    log.info("there are %d unfound objects" % unfound)
+    assert unfound
+
+    testdir = teuthology.get_testdir(ctx)
+    procs = []
+    if config.get('parallel_bench', True):
+        procs.append(mon.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'rados',
+                          '--no-log-to-stderr',
+                          '--name', 'client.admin',
+                          '-b', str(4<<10),
+                          '-p' , pool,
+                          '-t', '20',
+                          'bench', '240', 'write',
+                      ]).format(tdir=testdir),
+            ],
+            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+            stdin=run.PIPE,
+            wait=False
+        ))
+    time.sleep(10)
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        if pg['stat_sum']['num_objects_unfound'] > 0:
+            # verify that i can list them direct from the osd
+            log.info('listing missing/lost in %s state %s', pg['pgid'],
+                     pg['state']);
+            m = manager.list_pg_unfound(pg['pgid'])
+            log.info('%s' % m)
+            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+
+            log.info("reverting unfound in %s", pg['pgid'])
+            manager.raw_cluster_cmd('pg', pg['pgid'],
+                                    'mark_unfound_lost', 'delete')
+        else:
+            log.info("no unfound in %s", pg['pgid'])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
+    manager.flush_pg_stats([0, 2, 3])
+    manager.wait_for_recovery()
+
+    if not config.get('parallel_bench', True):
+        time.sleep(20)
+
+    # verify result
+    for f in range(1, 10):
+        err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-'])
+        assert err
+
+    # see if osd.1 can cope
+    manager.revive_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.wait_for_clean()
+    run.wait(procs)
+    manager.wait_for_clean()
diff --git a/qa/tasks/exec_on_cleanup.py b/qa/tasks/exec_on_cleanup.py
new file mode 100644
index 000000000..5a630781a
--- /dev/null
+++ b/qa/tasks/exec_on_cleanup.py
@@ -0,0 +1,61 @@
+"""
+Exececute custom commands during unwind/cleanup
+"""
+import logging
+import contextlib
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Execute commands on a given role
+
+        tasks:
+        - ceph:
+        - kclient: [client.a]
+        - exec:
+            client.a:
+              - "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"
+              - "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"
+        - interactive:
+
+    It stops and fails with the first command that does not return on success. It means
+    that if the first command fails, the second won't run at all.
+
+    To avoid confusion it is recommended to explicitly enclose the commands in 
+    double quotes. For instance if the command is false (without double quotes) it will
+    be interpreted as a boolean by the YAML parser.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    try:
+        yield
+    finally:
+        log.info('Executing custom commands...')
+        assert isinstance(config, dict), "task exec got invalid config"
+
+        testdir = teuthology.get_testdir(ctx)
+
+        if 'all' in config and len(config) == 1:
+            a = config['all']
+            roles = teuthology.all_roles(ctx.cluster)
+            config = dict((id_, a) for id_ in roles)
+
+        for role, ls in config.items():
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            log.info('Running commands on role %s host %s', role, remote.name)
+            for c in ls:
+                c.replace('$TESTDIR', testdir)
+                remote.run(
+                    args=[
+                        'sudo',
+                        'TESTDIR={tdir}'.format(tdir=testdir),
+                        'bash',
+                        '-c',
+                        c],
+                )
+
diff --git a/qa/tasks/filestore_idempotent.py b/qa/tasks/filestore_idempotent.py
new file mode 100644
index 000000000..319bef768
--- /dev/null
+++ b/qa/tasks/filestore_idempotent.py
@@ -0,0 +1,83 @@
+"""
+Filestore/filejournal handler
+"""
+import logging
+from teuthology.orchestra import run
+import random
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test filestore/filejournal handling of non-idempotent events.
+
+    Currently this is a kludge; we require the ceph task precedes us just
+    so that we get the tarball installed to run the test binary.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    # just use the first client...
+    client = next(iter(clients))
+    (remote,) = ctx.cluster.only(client).remotes.keys()
+
+    testdir = teuthology.get_testdir(ctx)
+
+    dir = '%s/ceph.data/test.%s' % (testdir, client)
+
+    seed = int(random.uniform(1,100))
+    start = 800 + random.randint(800,1200)
+    end = start + 50
+
+    try:
+        log.info('creating a working dir')
+        remote.run(args=['mkdir', dir])
+        remote.run(
+            args=[
+                'cd', dir,
+                run.Raw('&&'),
+                'wget','-q', '-Orun_seed_to.sh',
+                'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to.sh;hb=HEAD',
+                run.Raw('&&'),
+                'wget','-q', '-Orun_seed_to_range.sh',
+                'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to_range.sh;hb=HEAD',
+                run.Raw('&&'),
+                'chmod', '+x', 'run_seed_to.sh', 'run_seed_to_range.sh',
+                ]);
+
+        log.info('running a series of tests')
+        proc = remote.run(
+            args=[
+                'cd', dir,
+                run.Raw('&&'),
+                './run_seed_to_range.sh', str(seed), str(start), str(end),
+                ],
+            wait=False,
+            check_status=False)
+        result = proc.wait()
+
+        if result != 0:
+            remote.run(
+                args=[
+                    'cp', '-a', dir, '{tdir}/archive/idempotent_failure'.format(tdir=testdir),
+                    ])
+            raise Exception("./run_seed_to_range.sh errored out")
+
+    finally:
+        remote.run(args=[
+                'rm', '-rf', '--', dir
+                ])
+
diff --git a/qa/tasks/fs.py b/qa/tasks/fs.py
new file mode 100644
index 000000000..f7a9330e2
--- /dev/null
+++ b/qa/tasks/fs.py
@@ -0,0 +1,152 @@
+"""
+CephFS sub-tasks.
+"""
+
+import logging
+import re
+
+from tasks.cephfs.filesystem import Filesystem, MDSCluster
+
+log = logging.getLogger(__name__)
+
+# Everything up to CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
+CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1<<5)
+CEPH_MDSMAP_LAST = CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
+UPGRADE_FLAGS_MASK = ((CEPH_MDSMAP_LAST<<1) - 1)
+def pre_upgrade_save(ctx, config):
+    """
+    That the upgrade procedure doesn't clobber state: save state.
+    """
+
+    mdsc = MDSCluster(ctx)
+    status = mdsc.status()
+
+    state = {}
+    ctx['mds-upgrade-state'] = state
+
+    for fs in list(status.get_filesystems()):
+        fscid = fs['id']
+        mdsmap = fs['mdsmap']
+        fs_state = {}
+        fs_state['epoch'] = mdsmap['epoch']
+        fs_state['max_mds'] = mdsmap['max_mds']
+        fs_state['flags'] = mdsmap['flags'] & UPGRADE_FLAGS_MASK
+        state[fscid] = fs_state
+        log.debug(f"fs fscid={fscid},name={mdsmap['fs_name']} state = {fs_state}")
+
+
+def post_upgrade_checks(ctx, config):
+    """
+    That the upgrade procedure doesn't clobber state.
+    """
+
+    state = ctx['mds-upgrade-state']
+
+    mdsc = MDSCluster(ctx)
+    status = mdsc.status()
+
+    for fs in list(status.get_filesystems()):
+        fscid = fs['id']
+        mdsmap = fs['mdsmap']
+        fs_state = state[fscid]
+        log.debug(f"checking fs fscid={fscid},name={mdsmap['fs_name']} state = {fs_state}")
+
+        # check state was restored to previous values
+        assert fs_state['max_mds'] == mdsmap['max_mds']
+        assert fs_state['flags'] == (mdsmap['flags'] & UPGRADE_FLAGS_MASK)
+
+        # now confirm that the upgrade procedure was followed
+        epoch = mdsmap['epoch']
+        pre_upgrade_epoch = fs_state['epoch']
+        assert pre_upgrade_epoch < epoch
+        should_decrease_max_mds = fs_state['max_mds'] > 1
+        did_decrease_max_mds = False
+        should_disable_allow_standby_replay = fs_state['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
+        did_disable_allow_standby_replay = False
+        for i in range(pre_upgrade_epoch+1, mdsmap['epoch']):
+            old_status = mdsc.status(epoch=i)
+            old_fs = old_status.get_fsmap(fscid)
+            old_mdsmap = old_fs['mdsmap']
+            if should_decrease_max_mds and old_mdsmap['max_mds'] == 1:
+                log.debug(f"max_mds reduced in epoch {i}")
+                did_decrease_max_mds = True
+            if should_disable_allow_standby_replay and not (old_mdsmap['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY):
+                log.debug(f"allow_standby_replay disabled in epoch {i}")
+                did_disable_allow_standby_replay = True
+        assert not should_decrease_max_mds or did_decrease_max_mds
+        assert not should_disable_allow_standby_replay or did_disable_allow_standby_replay
+
+
+def ready(ctx, config):
+    """
+    That the file system is ready for clients.
+    """
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+
+    timeout = config.get('timeout', 300)
+
+    mdsc = MDSCluster(ctx)
+    status = mdsc.status()
+
+    for filesystem in status.get_filesystems():
+        fs = Filesystem(ctx, fscid=filesystem['id'])
+        fs.wait_for_daemons(timeout=timeout, status=status)
+
+def clients_evicted(ctx, config):
+    """
+    Check clients are evicted, unmount (cleanup) if so.
+    """
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+
+    clients = config.get('clients')
+
+    if clients is None:
+        clients = {("client."+client_id): True for client_id in ctx.mounts}
+
+    log.info("clients is {}".format(str(clients)))
+
+    fs = Filesystem(ctx)
+    status = fs.status()
+
+    has_session = set()
+    mounts = {}
+    for client in clients:
+        client_id = re.match("^client.([0-9]+)$", client).groups(1)[0]
+        mounts[client] = ctx.mounts.get(client_id)
+
+    for rank in fs.get_ranks(status=status):
+        ls = fs.rank_asok(['session', 'ls'], rank=rank['rank'], status=status)
+        for session in ls:
+            for client, evicted in clients.items():
+                mount = mounts.get(client)
+                if mount is not None:
+                    global_id = mount.get_global_id()
+                    if session['id'] == global_id:
+                        if evicted:
+                            raise RuntimeError("client still has session: {}".format(str(session)))
+                        else:
+                            log.info("client {} has a session with MDS {}.{}".format(client, fs.id, rank['rank']))
+                            has_session.add(client)
+
+    no_session = set(clients) - has_session
+    should_assert = False
+    for client, evicted in clients.items():
+        mount = mounts.get(client)
+        if mount is not None:
+            if evicted:
+                log.info("confirming client {} is blocklisted".format(client))
+                assert fs.is_addr_blocklisted(mount.get_global_addr())
+            elif client in no_session:
+                log.info("client {} should not be evicted but has no session with an MDS".format(client))
+                fs.is_addr_blocklisted(mount.get_global_addr()) # for debugging
+                should_assert = True
+    if should_assert:
+        raise RuntimeError("some clients which should not be evicted have no session with an MDS?")
diff --git a/qa/tasks/fwd_scrub.py b/qa/tasks/fwd_scrub.py
new file mode 100644
index 000000000..44fd97baa
--- /dev/null
+++ b/qa/tasks/fwd_scrub.py
@@ -0,0 +1,152 @@
+"""
+Thrash mds by simulating failures
+"""
+import logging
+import contextlib
+
+from gevent import sleep, GreenletExit
+from gevent.greenlet import Greenlet
+from gevent.event import Event
+from teuthology import misc as teuthology
+
+from tasks import ceph_manager
+from tasks.cephfs.filesystem import MDSCluster, Filesystem
+from tasks.thrasher import Thrasher
+
+log = logging.getLogger(__name__)
+
+class ForwardScrubber(Thrasher, Greenlet):
+    """
+    ForwardScrubber::
+
+    The ForwardScrubber does forward scrubbing of file-systems during execution
+    of other tasks (workunits, etc).
+    """
+
+    def __init__(self, fs, scrub_timeout=300, sleep_between_iterations=1):
+        super(ForwardScrubber, self).__init__()
+
+        self.logger = log.getChild('fs.[{f}]'.format(f=fs.name))
+        self.fs = fs
+        self.name = 'thrasher.fs.[{f}]'.format(f=fs.name)
+        self.stopping = Event()
+        self.scrub_timeout = scrub_timeout
+        self.sleep_between_iterations = sleep_between_iterations
+
+    def _run(self):
+        try:
+            self.do_scrub()
+        except Exception as e:
+            self.set_thrasher_exception(e)
+            self.logger.exception("exception:")
+            # allow successful completion so gevent doesn't see an exception...
+
+    def stop(self):
+        self.stopping.set()
+
+    def do_scrub(self):
+        """
+        Perform the file-system scrubbing
+        """
+        self.logger.info(f'start scrubbing fs: {self.fs.name}')
+
+        try:
+            while not self.stopping.is_set():
+                self._scrub()
+                sleep(self.sleep_between_iterations)
+        except GreenletExit:
+            pass
+
+        self.logger.info(f'end scrubbing fs: {self.fs.name}')
+
+    def _scrub(self, path="/", recursive=True):
+        self.logger.info(f"scrubbing fs: {self.fs.name}")
+        scrubopts = ["force"]
+        if recursive:
+            scrubopts.append("recursive")
+        out_json = self.fs.run_scrub(["start", path, ",".join(scrubopts)])
+        assert out_json is not None
+
+        tag = out_json['scrub_tag']
+
+        assert tag is not None
+        assert out_json['return_code'] == 0
+        assert out_json['mode'] == 'asynchronous'
+
+        return self.fs.wait_until_scrub_complete(tag=tag, sleep=30,
+                                                 timeout=self.scrub_timeout)
+
+def stop_all_fwd_scrubbers(thrashers):
+    for thrasher in thrashers:
+        if not isinstance(thrasher, ForwardScrubber):
+            continue
+        thrasher.stop()
+        thrasher.join()
+        if thrasher.exception is not None:
+            raise RuntimeError(f"error during scrub thrashing: {thrasher.exception}")
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Stress test the mds by running scrub iterations while another task/workunit
+    is running.
+    Example config:
+
+    - fwd_scrub:
+      scrub_timeout: 300
+      sleep_between_iterations: 1
+    """
+
+    mds_cluster = MDSCluster(ctx)
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'fwd_scrub task only accepts a dict for configuration'
+    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
+    assert len(mdslist) > 0, \
+        'fwd_scrub task requires at least 1 metadata server'
+
+    (first,) = ctx.cluster.only(f'mds.{mdslist[0]}').remotes.keys()
+    manager = ceph_manager.CephManager(
+        first, ctx=ctx, logger=log.getChild('ceph_manager'),
+    )
+
+    # make sure everyone is in active, standby, or standby-replay
+    log.info('Wait for all MDSs to reach steady state...')
+    status = mds_cluster.status()
+    while True:
+        steady = True
+        for info in status.get_all():
+            state = info['state']
+            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
+                steady = False
+                break
+        if steady:
+            break
+        sleep(2)
+        status = mds_cluster.status()
+
+    log.info('Ready to start scrub thrashing')
+
+    manager.wait_for_clean()
+    assert manager.is_clean()
+
+    if 'cluster' not in config:
+        config['cluster'] = 'ceph'
+
+    for fs in status.get_filesystems():
+        fwd_scrubber = ForwardScrubber(Filesystem(ctx, fscid=fs['id']),
+                                       config['scrub_timeout'],
+                                       config['sleep_between_iterations'])
+        fwd_scrubber.start()
+        ctx.ceph[config['cluster']].thrashers.append(fwd_scrubber)
+
+    try:
+        log.debug('Yielding')
+        yield
+    finally:
+        log.info('joining ForwardScrubbers')
+        stop_all_fwd_scrubbers(ctx.ceph[config['cluster']].thrashers)
+        log.info('done joining')
diff --git a/qa/tasks/immutable_object_cache.py b/qa/tasks/immutable_object_cache.py
new file mode 100644
index 000000000..b8034de47
--- /dev/null
+++ b/qa/tasks/immutable_object_cache.py
@@ -0,0 +1,72 @@
+"""
+immutable object cache task
+"""
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def immutable_object_cache(ctx, config):
+    """
+    setup and cleanup immutable object cache
+    """
+    log.info("start immutable object cache daemon")
+    for client, client_config in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        # make sure that there is one immutable object cache daemon on the same node.
+        remote.run(
+            args=[
+                'sudo', 'killall', '-s', '9', 'ceph-immutable-object-cache', run.Raw('||'), 'true',
+                ]
+            )
+        remote.run(
+            args=[
+                'ceph-immutable-object-cache', '-b',
+                ]
+            )
+    try:
+        yield
+    finally:
+        log.info("check and cleanup immutable object cache")
+        for client, client_config in config.items():
+            client_config = client_config if client_config is not None else dict()
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            cache_path = client_config.get('immutable object cache path', '/tmp/ceph-immutable-object-cache')
+            ls_command = '"$(ls {} )"'.format(cache_path)
+            remote.run(
+                args=[
+                    'test', '-n', run.Raw(ls_command),
+                    ]
+                )
+            remote.run(
+                args=[
+                    'sudo', 'killall', '-s', '9', 'ceph-immutable-object-cache', run.Raw('||'), 'true',
+                    ]
+                )
+            remote.run(
+                args=[
+                    'sudo', 'rm', '-rf', cache_path, run.Raw('||'), 'true',
+                    ]
+                )
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    This is task for start immutable_object_cache.
+    """
+    assert isinstance(config, dict), \
+           "task immutable_object_cache only supports a dictionary for configuration"
+
+    managers = []
+    config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    managers.append(
+        lambda: immutable_object_cache(ctx=ctx, config=config)
+        )
+
+    with contextutil.nested(*managers):
+        yield
diff --git a/qa/tasks/immutable_object_cache_thrash.py b/qa/tasks/immutable_object_cache_thrash.py
new file mode 100644
index 000000000..0bf3ad3a0
--- /dev/null
+++ b/qa/tasks/immutable_object_cache_thrash.py
@@ -0,0 +1,79 @@
+"""
+immutable object cache thrash task
+"""
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+DEFAULT_KILL_DAEMON_TIME = 2
+DEFAULT_DEAD_TIME = 30
+DEFAULT_LIVE_TIME = 120
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def thrashes_immutable_object_cache_daemon(ctx, config):
+    """
+    thrashes immutable object cache daemon.
+    It can test reconnection feature of RO cache when RO daemon crash
+    TODO : replace sleep with better method.
+    """
+    log.info("thrashes immutable object cache daemon")
+
+    # just thrash one rbd client.
+    client, client_config = list(config.items())[0]
+    (remote,) = ctx.cluster.only(client).remotes.keys()
+    client_config = client_config if client_config is not None else dict()
+    kill_daemon_time = client_config.get('kill_daemon_time', DEFAULT_KILL_DAEMON_TIME)
+    dead_time = client_config.get('dead_time', DEFAULT_DEAD_TIME)
+    live_time = client_config.get('live_time', DEFAULT_LIVE_TIME)
+
+    for i in range(kill_daemon_time):
+        log.info("ceph-immutable-object-cache crash....")
+        remote.run(
+            args=[
+                'sudo', 'killall', '-s', '9', 'ceph-immutable-object-cache', run.Raw('||'), 'true',
+                 ]
+            )
+        # librbd shoud normally run when ceph-immutable-object-cache
+        remote.run(
+            args=[
+                'sleep', '{dead_time}'.format(dead_time=dead_time),
+                 ]
+            )
+        # librbd should reconnect daemon
+        log.info("startup ceph-immutable-object-cache")
+        remote.run(
+            args=[
+                'ceph-immutable-object-cache', '-b',
+                 ]
+            )
+        remote.run(
+            args=[
+                'sleep', '{live_time}'.format(live_time=live_time),
+                 ]
+            )
+    try:
+        yield
+    finally:
+        log.info("cleanup")
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    This is task for testing immutable_object_cache thrash.
+    """
+    assert isinstance(config, dict), \
+            "task immutable_object_cache_thrash only supports a dictionary for configuration"
+
+    managers = []
+    config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    managers.append(
+        lambda: thrashes_immutable_object_cache_daemon(ctx=ctx, config=config)
+        )
+
+    with contextutil.nested(*managers):
+        yield
diff --git a/qa/tasks/kclient.py b/qa/tasks/kclient.py
new file mode 100644
index 000000000..d7bc9fa83
--- /dev/null
+++ b/qa/tasks/kclient.py
@@ -0,0 +1,144 @@
+"""
+Mount/unmount a ``kernel`` client.
+"""
+import contextlib
+import logging
+
+from teuthology.misc import deep_merge
+from teuthology.orchestra.run import CommandFailedError
+from teuthology import misc
+from teuthology.contextutil import MaxWhileTries
+from tasks.cephfs.kernel_mount import KernelMount
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Mount/unmount a ``kernel`` client.
+
+    The config is optional and defaults to mounting on all clients. If
+    a config is given, it is expected to be a list of clients to do
+    this operation on. This lets you e.g. set up one client with
+    ``ceph-fuse`` and another with ``kclient``.
+
+    ``brxnet`` should be a Private IPv4 Address range, default range is
+    [192.168.0.0/16]
+
+    Example that mounts all clients::
+
+        tasks:
+        - ceph:
+        - kclient:
+        - interactive:
+        - brxnet: [192.168.0.0/16]
+
+    Example that uses both ``kclient` and ``ceph-fuse``::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - kclient: [client.1]
+        - interactive:
+
+
+    Pass a dictionary instead of lists to specify per-client config:
+
+        tasks:
+        -kclient:
+            client.0:
+                debug: true
+                mntopts: ["nowsync"]
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info('Mounting kernel clients...')
+
+    if config is None:
+        ids = misc.all_roles_of_type(ctx.cluster, 'client')
+        client_roles = [f'client.{id_}' for id_ in ids]
+        config = dict([r, dict()] for r in client_roles)
+    elif isinstance(config, list):
+        client_roles = config
+        config = dict([r, dict()] for r in client_roles)
+    elif isinstance(config, dict):
+        client_roles = filter(lambda x: 'client.' in x, config.keys())
+    else:
+        raise ValueError(f"Invalid config object: {config} ({config.__class__})")
+    log.info(f"config is {config}")
+
+    clients = list(misc.get_clients(ctx=ctx, roles=client_roles))
+
+    test_dir = misc.get_testdir(ctx)
+
+    for id_, remote in clients:
+        KernelMount.cleanup_stale_netnses_and_bridge(remote)
+
+    mounts = {}
+    overrides = ctx.config.get('overrides', {}).get('kclient', {})
+    top_overrides = dict(filter(lambda x: 'client.' not in x[0], overrides.items()))
+    for id_, remote in clients:
+        entity = f"client.{id_}"
+        client_config = config.get(entity)
+        if client_config is None:
+            client_config = {}
+        # top level overrides
+        deep_merge(client_config, top_overrides)
+        # mount specific overrides
+        client_config_overrides = overrides.get(entity)
+        deep_merge(client_config, client_config_overrides)
+        log.info(f"{entity} config is {client_config}")
+
+        cephfs_name = client_config.get("cephfs_name")
+        if config.get("disabled", False) or not client_config.get('mounted', True):
+            continue
+
+        kernel_mount = KernelMount(
+            ctx=ctx,
+            test_dir=test_dir,
+            client_id=id_,
+            client_remote=remote,
+            brxnet=ctx.teuthology_config.get('brxnet', None),
+            config=client_config,
+            cephfs_name=cephfs_name)
+
+        mounts[id_] = kernel_mount
+
+        if client_config.get('debug', False):
+            remote.run(args=["sudo", "bash", "-c", "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"])
+            remote.run(args=["sudo", "bash", "-c", "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"])
+
+        kernel_mount.mount(mntopts=client_config.get('mntopts', []))
+
+    def umount_all():
+        log.info('Unmounting kernel clients...')
+
+        forced = False
+        for mount in mounts.values():
+            if mount.is_mounted():
+                try:
+                    mount.umount()
+                except (CommandFailedError, MaxWhileTries):
+                    log.warning("Ordinary umount failed, forcing...")
+                    forced = True
+                    mount.umount_wait(force=True)
+
+        for id_, remote in clients:
+            KernelMount.cleanup_stale_netnses_and_bridge(remote)
+
+        return forced
+
+    ctx.mounts = mounts
+    try:
+        yield mounts
+    except:
+        umount_all()  # ignore forced retval, we are already in error handling
+    finally:
+
+        forced = umount_all()
+        if forced:
+            # The context managers within the kclient manager worked (i.e.
+            # the test workload passed) but for some reason we couldn't
+            # umount, so turn this into a test failure.
+            raise RuntimeError("Kernel mounts did not umount cleanly")
diff --git a/qa/tasks/keycloak.py b/qa/tasks/keycloak.py
new file mode 100644
index 000000000..055902836
--- /dev/null
+++ b/qa/tasks/keycloak.py
@@ -0,0 +1,463 @@
+"""
+Deploy and configure Keycloak for Teuthology
+"""
+import contextlib
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.exceptions import ConfigError
+
+log = logging.getLogger(__name__)
+
+def get_keycloak_version(config):
+    for client, client_config in config.items():
+        if 'keycloak_version' in client_config:
+            keycloak_version = client_config.get('keycloak_version')
+    return keycloak_version
+
+def get_keycloak_dir(ctx, config):
+    keycloak_version = get_keycloak_version(config)
+    current_version = 'keycloak-'+keycloak_version
+    return '{tdir}/{ver}'.format(tdir=teuthology.get_testdir(ctx),ver=current_version)
+
+def run_in_keycloak_dir(ctx, client, config, args, **kwargs):
+    return ctx.cluster.only(client).run(
+        args=[ 'cd', get_keycloak_dir(ctx,config), run.Raw('&&'), ] + args,
+        **kwargs
+    )
+
+def get_toxvenv_dir(ctx):
+    return ctx.tox.venv_path
+
+def toxvenv_sh(ctx, remote, args, **kwargs):
+    activate = get_toxvenv_dir(ctx) + '/bin/activate'
+    return remote.sh(['source', activate, run.Raw('&&')] + args, **kwargs)
+
+@contextlib.contextmanager
+def install_packages(ctx, config):
+    """
+    Downloading the two required tar files
+    1. Keycloak
+    2. Wildfly (Application Server)
+    """
+    assert isinstance(config, dict)
+    log.info('Installing packages for Keycloak...')
+
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        test_dir=teuthology.get_testdir(ctx)
+        current_version = get_keycloak_version(config)
+        link1 = 'https://downloads.jboss.org/keycloak/'+current_version+'/keycloak-'+current_version+'.tar.gz'
+        toxvenv_sh(ctx, remote, ['wget', link1])
+        
+        file1 = 'keycloak-'+current_version+'.tar.gz'
+        toxvenv_sh(ctx, remote, ['tar', '-C', test_dir, '-xvzf', file1])
+
+        link2 ='https://downloads.jboss.org/keycloak/'+current_version+'/adapters/keycloak-oidc/keycloak-wildfly-adapter-dist-'+current_version+'.tar.gz' 
+        toxvenv_sh(ctx, remote, ['cd', '{tdir}'.format(tdir=get_keycloak_dir(ctx,config)), run.Raw('&&'), 'wget', link2])
+        
+        file2 = 'keycloak-wildfly-adapter-dist-'+current_version+'.tar.gz'
+        toxvenv_sh(ctx, remote, ['tar', '-C', '{tdir}'.format(tdir=get_keycloak_dir(ctx,config)), '-xvzf', '{tdr}/{file}'.format(tdr=get_keycloak_dir(ctx,config),file=file2)])
+
+    try:
+        yield
+    finally:
+        log.info('Removing packaged dependencies of Keycloak...')
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=['rm', '-rf', '{tdir}'.format(tdir=get_keycloak_dir(ctx,config))],
+            )
+
+@contextlib.contextmanager
+def download_conf(ctx, config):
+    """
+    Downloads confi.py used in run_admin_cmds
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading conf...')
+    testdir = teuthology.get_testdir(ctx)
+    conf_branch = 'main'
+    conf_repo = 'https://github.com/TRYTOBE8TME/scripts.git'
+    for (client, _) in config.items():
+        ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone',
+                '-b', conf_branch,
+                conf_repo,
+                '{tdir}/scripts'.format(tdir=testdir),
+                ],
+            )
+    try:
+        yield
+    finally:
+        log.info('Removing conf...')
+        testdir = teuthology.get_testdir(ctx)
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[
+                    'rm',
+                    '-rf',
+                    '{tdir}/scripts'.format(tdir=testdir),
+                    ],
+                )
+
+@contextlib.contextmanager
+def build(ctx,config):
+    """
+    Build process which needs to be done before starting a server.
+    """
+    assert isinstance(config, dict)
+    log.info('Building Keycloak...')
+    for (client,_) in config.items():
+        run_in_keycloak_dir(ctx, client, config,['cd', 'bin', run.Raw('&&'), './jboss-cli.sh', '--file=adapter-elytron-install-offline.cli'])
+    try:
+        yield
+    finally:
+        pass
+
+@contextlib.contextmanager
+def run_keycloak(ctx,config):
+    """
+    This includes two parts:
+    1. Adding a user to keycloak which is actually used to log in when we start the server and check in browser.
+    2. Starting the server.
+    """
+    assert isinstance(config, dict)
+    log.info('Bringing up Keycloak...')
+    for (client,_) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        
+        ctx.cluster.only(client).run(
+            args=[
+                '{tdir}/bin/add-user-keycloak.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                '-r', 'master',
+                '-u', 'admin',
+                '-p', 'admin',
+            ],
+        )
+
+        toxvenv_sh(ctx, remote, ['cd', '{tdir}/bin'.format(tdir=get_keycloak_dir(ctx,config)), run.Raw('&&'), './standalone.sh', run.Raw('&'), 'exit'])
+    try:
+        yield
+    finally:
+        log.info('Stopping Keycloak Server...')
+
+        for (client, _) in config.items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            toxvenv_sh(ctx, remote, ['cd', '{tdir}/bin'.format(tdir=get_keycloak_dir(ctx,config)), run.Raw('&&'), './jboss-cli.sh', '--connect', 'command=:shutdown'])
+
+@contextlib.contextmanager
+def run_admin_cmds(ctx,config):
+    """
+    Running Keycloak Admin commands(kcadm commands) in order to get the token, aud value, thumbprint and realm name.
+    """
+    assert isinstance(config, dict)
+    log.info('Running admin commands...')
+    for (client,_) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+
+        remote.run(
+           args=[
+                '{tdir}/bin/kcadm.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                'config', 'credentials',
+                '--server', 'http://localhost:8080/auth',
+                '--realm', 'master',
+                '--user', 'admin',
+                '--password', 'admin',
+                '--client', 'admin-cli',
+                ],
+            )
+
+        realm_name='demorealm'
+        realm='realm={}'.format(realm_name)
+
+        remote.run(
+           args=[
+                '{tdir}/bin/kcadm.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                'create', 'realms',
+                '-s', realm,
+                '-s', 'enabled=true',
+                '-s', 'accessTokenLifespan=1800',
+                '-o',
+            ],
+        )
+
+        client_name='my_client'
+        client='clientId={}'.format(client_name)
+
+        remote.run(
+           args=[
+                '{tdir}/bin/kcadm.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                'create', 'clients',
+                '-r', realm_name,
+                '-s', client,
+                '-s', 'directAccessGrantsEnabled=true',
+                '-s', 'redirectUris=["http://localhost:8080/myapp/*"]',
+            ],
+        )
+
+        ans1= toxvenv_sh(ctx, remote, 
+                 [
+                  'cd', '{tdir}/bin'.format(tdir=get_keycloak_dir(ctx,config)), run.Raw('&&'), 
+                  './kcadm.sh', 'get', 'clients', 
+                  '-r', realm_name, 
+                  '-F', 'id,clientId', run.Raw('|'), 
+                  'jq', '-r', '.[] | select (.clientId == "my_client") | .id'
+                 ])
+
+        pre0=ans1.rstrip()
+        pre1="clients/{}".format(pre0)
+
+        remote.run(
+            args=[
+                '{tdir}/bin/kcadm.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                'update', pre1,
+                '-r', realm_name,
+                '-s', 'enabled=true',
+                '-s', 'serviceAccountsEnabled=true',
+                '-s', 'redirectUris=["http://localhost:8080/myapp/*"]',
+            ],
+        )
+
+        ans2= pre1+'/client-secret'
+
+        out2= toxvenv_sh(ctx, remote, 
+                 [
+                  'cd', '{tdir}/bin'.format(tdir=get_keycloak_dir(ctx,config)), run.Raw('&&'), 
+                  './kcadm.sh', 'get', ans2, 
+                  '-r', realm_name, 
+                  '-F', 'value'
+                 ])
+
+        ans0= '{client}:{secret}'.format(client=client_name,secret=out2[15:51])
+        ans3= 'client_secret={}'.format(out2[15:51])
+        clientid='client_id={}'.format(client_name)
+
+        proto_map = pre1+"/protocol-mappers/models"
+        uname = "username=testuser"
+        upass = "password=testuser"
+
+        remote.run(
+            args=[
+                '{tdir}/bin/kcadm.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                'create', 'users',
+                '-s', uname,
+                '-s', 'enabled=true',
+                '-s', 'attributes.\"https://aws.amazon.com/tags\"=\"{"principal_tags":{"Department":["Engineering", "Marketing"]}}\"',
+                '-r', realm_name, 
+            ],
+        )
+
+        sample = 'testuser'
+
+        remote.run(
+            args=[
+                '{tdir}/bin/kcadm.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                'set-password',
+                '-r', realm_name,
+                '--username', sample,
+                '--new-password', sample,
+            ],
+        )
+
+        file_path = '{tdir}/scripts/confi.py'.format(tdir=teuthology.get_testdir(ctx))
+
+        remote.run(
+            args=[
+                '{tdir}/bin/kcadm.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                'create', proto_map,
+                '-r', realm_name,
+                '-f', file_path,
+            ],
+        )
+
+        remote.run(
+            args=[
+                '{tdir}/bin/kcadm.sh'.format(tdir=get_keycloak_dir(ctx,config)),
+                'config', 'credentials',
+                '--server', 'http://localhost:8080/auth',
+                '--realm', realm_name,
+                '--user', sample,
+                '--password', sample,
+                '--client', 'admin-cli',
+            ],
+        )
+
+        out9= toxvenv_sh(ctx, remote,
+                 [
+                  'curl', '-k', '-v',
+                  '-X', 'POST',
+                  '-H', 'Content-Type:application/x-www-form-urlencoded',
+                  '-d', 'scope=openid',
+                  '-d', 'grant_type=password',
+                  '-d', clientid,
+                  '-d', ans3,
+                  '-d', uname,
+                  '-d', upass,
+                  'http://localhost:8080/auth/realms/'+realm_name+'/protocol/openid-connect/token', run.Raw('|'),
+                  'jq', '-r', '.access_token'
+                 ])
+
+        user_token_pre = out9.rstrip()
+        user_token = '{}'.format(user_token_pre)
+
+        out3= toxvenv_sh(ctx, remote, 
+                 [
+                  'curl', '-k', '-v', 
+                  '-X', 'POST', 
+                  '-H', 'Content-Type:application/x-www-form-urlencoded', 
+                  '-d', 'scope=openid', 
+                  '-d', 'grant_type=client_credentials', 
+                  '-d', clientid, 
+                  '-d', ans3, 
+                  'http://localhost:8080/auth/realms/'+realm_name+'/protocol/openid-connect/token', run.Raw('|'), 
+                  'jq', '-r', '.access_token'
+                 ])
+
+        pre2=out3.rstrip()
+        acc_token= 'token={}'.format(pre2)
+        ans4= '{}'.format(pre2)
+
+        out4= toxvenv_sh(ctx, remote, 
+                 [
+                  'curl', '-k', '-v', 
+                  '-X', 'GET', 
+                  '-H', 'Content-Type:application/x-www-form-urlencoded', 
+                  'http://localhost:8080/auth/realms/'+realm_name+'/protocol/openid-connect/certs', run.Raw('|'), 
+                  'jq', '-r', '.keys[].x5c[]'
+                 ])
+
+        pre3=out4.rstrip()
+        cert_value='{}'.format(pre3)
+        start_value= "-----BEGIN CERTIFICATE-----\n"
+        end_value= "\n-----END CERTIFICATE-----"
+        user_data=""
+        user_data+=start_value
+        user_data+=cert_value
+        user_data+=end_value
+
+        remote.write_file(
+            path='{tdir}/bin/certificate.crt'.format(tdir=get_keycloak_dir(ctx,config)),
+            data=user_data
+            )
+
+        out5= toxvenv_sh(ctx, remote, 
+                 [
+                  'openssl', 'x509', 
+                  '-in', '{tdir}/bin/certificate.crt'.format(tdir=get_keycloak_dir(ctx,config)), 
+                  '--fingerprint', '--noout', '-sha1'
+                 ])
+
+        pre_ans= '{}'.format(out5[17:76])
+        ans5=""
+
+        for character in pre_ans:
+            if(character!=':'):
+                ans5+=character
+
+        str1 = 'curl'
+        str2 = '-k'
+        str3 = '-v'
+        str4 = '-X'
+        str5 = 'POST'
+        str6 = '-u'
+        str7 = '-d'
+        str8 = 'http://localhost:8080/auth/realms/'+realm_name+'/protocol/openid-connect/token/introspect'
+
+        out6= toxvenv_sh(ctx, remote,
+                 [
+                  str1, str2, str3, str4, str5, str6, ans0, str7, acc_token, str8, run.Raw('|'), 'jq', '-r', '.aud'
+                 ])
+
+        out7= toxvenv_sh(ctx, remote,
+                 [ 
+                  str1, str2, str3, str4, str5, str6, ans0, str7, acc_token, str8, run.Raw('|'), 'jq', '-r', '.sub'
+                 ])
+
+        out8= toxvenv_sh(ctx, remote,
+                 [ 
+                  str1, str2, str3, str4, str5, str6, ans0, str7, acc_token, str8, run.Raw('|'), 'jq', '-r', '.azp'
+                 ])
+
+        ans6=out6.rstrip()
+        ans7=out7.rstrip()
+        ans8=out8.rstrip()
+
+        os.environ['TOKEN']=ans4
+        os.environ['THUMBPRINT']=ans5
+        os.environ['AUD']=ans6
+        os.environ['SUB']=ans7
+        os.environ['AZP']=ans8
+        os.environ['USER_TOKEN']=user_token
+        os.environ['KC_REALM']=realm_name
+
+    try:
+        yield
+    finally:
+        log.info('Removing certificate.crt file...')
+        for (client,_) in config.items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(
+                 args=['rm', '-f',
+                       '{tdir}/bin/certificate.crt'.format(tdir=get_keycloak_dir(ctx,config)),
+                 ],
+                 )
+
+            remote.run(
+                 args=['rm', '-f',
+                       '{tdir}/confi.py'.format(tdir=teuthology.get_testdir(ctx)),
+                 ],
+                 )
+
+@contextlib.contextmanager
+def task(ctx,config):
+    """
+    To run keycloak the prerequisite is to run the tox task. Following is the way how to run
+    tox and then keycloak::
+
+    tasks:
+    - tox: [ client.0 ]
+    - keycloak:
+        client.0:
+            keycloak_version: 11.0.0
+   
+    To pass extra arguments to nose (e.g. to run a certain test)::
+
+    tasks:
+    - tox: [ client.0 ]
+    - keycloak:
+        client.0:
+            keycloak_version: 11.0.0
+    - s3tests:
+        client.0:
+          extra_attrs: ['webidentity_test']
+ 
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task keycloak only supports a list or dictionary for configuration"
+
+    if not hasattr(ctx, 'tox'):
+        raise ConfigError('keycloak must run after the tox task')
+
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    log.debug('Keycloak config is %s', config)
+    
+    with contextutil.nested(
+        lambda: install_packages(ctx=ctx, config=config),
+        lambda: build(ctx=ctx, config=config),
+        lambda: run_keycloak(ctx=ctx, config=config),
+        lambda: download_conf(ctx=ctx, config=config),
+        lambda: run_admin_cmds(ctx=ctx, config=config),
+        ):
+        yield
+
diff --git a/qa/tasks/keystone.py b/qa/tasks/keystone.py
new file mode 100644
index 000000000..ad836006f
--- /dev/null
+++ b/qa/tasks/keystone.py
@@ -0,0 +1,463 @@
+"""
+Deploy and configure Keystone for Teuthology
+"""
+import argparse
+import contextlib
+import logging
+
+# still need this for python3.6
+from collections import OrderedDict
+from itertools import chain
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.packaging import install_package
+from teuthology.packaging import remove_package
+from teuthology.exceptions import ConfigError
+
+log = logging.getLogger(__name__)
+
+
+def get_keystone_dir(ctx):
+    return '{tdir}/keystone'.format(tdir=teuthology.get_testdir(ctx))
+
+def run_in_keystone_dir(ctx, client, args, **kwargs):
+    return ctx.cluster.only(client).run(
+        args=[ 'cd', get_keystone_dir(ctx), run.Raw('&&'), ] + args,
+        **kwargs
+    )
+
+def get_toxvenv_dir(ctx):
+    return ctx.tox.venv_path
+
+def toxvenv_sh(ctx, remote, args, **kwargs):
+    activate = get_toxvenv_dir(ctx) + '/bin/activate'
+    return remote.sh(['source', activate, run.Raw('&&')] + args, **kwargs)
+
+def run_in_keystone_venv(ctx, client, args):
+    run_in_keystone_dir(ctx, client,
+                        [   'source',
+                            '.tox/venv/bin/activate',
+                            run.Raw('&&')
+                        ] + args)
+
+def get_keystone_venved_cmd(ctx, cmd, args):
+    kbindir = get_keystone_dir(ctx) + '/.tox/venv/bin/'
+    return [ kbindir + 'python', kbindir + cmd ] + args
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download the Keystone from github.
+    Remove downloaded file upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading keystone...')
+    keystonedir = get_keystone_dir(ctx)
+
+    for (client, cconf) in config.items():
+        ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone',
+                '-b', cconf.get('force-branch', 'master'),
+                'https://github.com/openstack/keystone.git',
+                keystonedir,
+                ],
+            )
+
+        sha1 = cconf.get('sha1')
+        if sha1 is not None:
+            run_in_keystone_dir(ctx, client, [
+                    'git', 'reset', '--hard', sha1,
+                ],
+            )
+
+        # hax for http://tracker.ceph.com/issues/23659
+        run_in_keystone_dir(ctx, client, [
+                'sed', '-i',
+                's/pysaml2<4.0.3,>=2.4.0/pysaml2>=4.5.0/',
+                'requirements.txt'
+            ],
+        )
+    try:
+        yield
+    finally:
+        log.info('Removing keystone...')
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[ 'rm', '-rf', keystonedir ],
+            )
+
+patch_bindep_template = """\
+import fileinput
+import sys
+import os
+fixed=False
+os.chdir("{keystone_dir}")
+for line in fileinput.input("bindep.txt", inplace=True):
+ if line == "python34-devel [platform:centos]\\n":
+  line="python34-devel [platform:centos-7]\\npython36-devel [platform:centos-8]\\n" 
+  fixed=True
+ print(line,end="")
+
+print("Fixed line" if fixed else "No fix necessary", file=sys.stderr)
+exit(0)
+"""
+
+@contextlib.contextmanager
+def install_packages(ctx, config):
+    """
+    Download the packaged dependencies of Keystone.
+    Remove install packages upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Installing packages for Keystone...')
+
+    patch_bindep = patch_bindep_template \
+        .replace("{keystone_dir}", get_keystone_dir(ctx))
+    packages = {}
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        toxvenv_sh(ctx, remote, ['python'], stdin=patch_bindep)
+        # use bindep to read which dependencies we need from keystone/bindep.txt
+        toxvenv_sh(ctx, remote, ['pip', 'install', 'bindep'])
+        packages[client] = toxvenv_sh(ctx, remote,
+                ['bindep', '--brief', '--file', '{}/bindep.txt'.format(get_keystone_dir(ctx))],
+                check_status=False).splitlines() # returns 1 on success?
+        for dep in packages[client]:
+            install_package(dep, remote)
+    try:
+        yield
+    finally:
+        log.info('Removing packaged dependencies of Keystone...')
+
+        for (client, _) in config.items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            for dep in packages[client]:
+                remove_package(dep, remote)
+
+@contextlib.contextmanager
+def setup_venv(ctx, config):
+    """
+    Setup the virtualenv for Keystone using tox.
+    """
+    assert isinstance(config, dict)
+    log.info('Setting up virtualenv for keystone...')
+    for (client, _) in config.items():
+        run_in_keystone_dir(ctx, client,
+            [   'source',
+                '{tvdir}/bin/activate'.format(tvdir=get_toxvenv_dir(ctx)),
+                run.Raw('&&'),
+                'tox', '-e', 'venv', '--notest'
+            ])
+
+        run_in_keystone_venv(ctx, client,
+            [   'pip', 'install',
+                'python-openstackclient==5.2.1',
+                'osc-lib==2.0.0'
+             ])
+    try:
+        yield
+    finally:
+        pass
+
+@contextlib.contextmanager
+def configure_instance(ctx, config):
+    assert isinstance(config, dict)
+    log.info('Configuring keystone...')
+
+    keyrepo_dir = '{kdir}/etc/fernet-keys'.format(kdir=get_keystone_dir(ctx))
+    for (client, _) in config.items():
+        # prepare the config file
+        run_in_keystone_dir(ctx, client,
+            [
+                'source',
+                f'{get_toxvenv_dir(ctx)}/bin/activate',
+                run.Raw('&&'),
+                'tox', '-e', 'genconfig'
+            ])
+        run_in_keystone_dir(ctx, client,
+            [
+                'cp', '-f',
+                'etc/keystone.conf.sample',
+                'etc/keystone.conf'
+            ])
+        run_in_keystone_dir(ctx, client,
+            [
+                'sed',
+                '-e', 's^#key_repository =.*^key_repository = {kr}^'.format(kr = keyrepo_dir),
+                '-i', 'etc/keystone.conf'
+            ])
+        # log to a file that gets archived
+        log_file = '{p}/archive/keystone.{c}.log'.format(p=teuthology.get_testdir(ctx), c=client)
+        run_in_keystone_dir(ctx, client,
+            [
+                'sed',
+                '-e', 's^#log_file =.*^log_file = {}^'.format(log_file),
+                '-i', 'etc/keystone.conf'
+            ])
+        # copy the config to archive
+        run_in_keystone_dir(ctx, client, [
+                'cp', 'etc/keystone.conf',
+                '{}/archive/keystone.{}.conf'.format(teuthology.get_testdir(ctx), client)
+            ])
+
+        # prepare key repository for Fetnet token authenticator
+        run_in_keystone_dir(ctx, client, [ 'mkdir', '-p', keyrepo_dir ])
+        run_in_keystone_venv(ctx, client, [ 'keystone-manage', 'fernet_setup' ])
+
+        # sync database
+        run_in_keystone_venv(ctx, client, [ 'keystone-manage', 'db_sync' ])
+    yield
+
+@contextlib.contextmanager
+def run_keystone(ctx, config):
+    assert isinstance(config, dict)
+    log.info('Configuring keystone...')
+
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        cluster_name, _, client_id = teuthology.split_role(client)
+
+        # start the public endpoint
+        client_public_with_id = 'keystone.public' + '.' + client_id
+
+        public_host, public_port = ctx.keystone.public_endpoints[client]
+        run_cmd = get_keystone_venved_cmd(ctx, 'keystone-wsgi-public',
+            [   '--host', public_host, '--port', str(public_port),
+                # Let's put the Keystone in background, wait for EOF
+                # and after receiving it, send SIGTERM to the daemon.
+                # This crazy hack is because Keystone, in contrast to
+                # our other daemons, doesn't quit on stdin.close().
+                # Teuthology relies on this behaviour.
+		run.Raw('& { read; kill %1; }')
+            ]
+        )
+        ctx.daemons.add_daemon(
+            remote, 'keystone', client_public_with_id,
+            cluster=cluster_name,
+            args=run_cmd,
+            logger=log.getChild(client),
+            stdin=run.PIPE,
+            cwd=get_keystone_dir(ctx),
+            wait=False,
+            check_status=False,
+        )
+
+        # start the admin endpoint
+        client_admin_with_id = 'keystone.admin' + '.' + client_id
+
+        admin_host, admin_port = ctx.keystone.admin_endpoints[client]
+        run_cmd = get_keystone_venved_cmd(ctx, 'keystone-wsgi-admin',
+            [   '--host', admin_host, '--port', str(admin_port),
+                run.Raw('& { read; kill %1; }')
+            ]
+        )
+        ctx.daemons.add_daemon(
+            remote, 'keystone', client_admin_with_id,
+            cluster=cluster_name,
+            args=run_cmd,
+            logger=log.getChild(client),
+            stdin=run.PIPE,
+            cwd=get_keystone_dir(ctx),
+            wait=False,
+            check_status=False,
+        )
+
+        # sleep driven synchronization
+        run_in_keystone_venv(ctx, client, [ 'sleep', '15' ])
+    try:
+        yield
+    finally:
+        log.info('Stopping Keystone admin instance')
+        ctx.daemons.get_daemon('keystone', client_admin_with_id,
+                               cluster_name).stop()
+
+        log.info('Stopping Keystone public instance')
+        ctx.daemons.get_daemon('keystone', client_public_with_id,
+                               cluster_name).stop()
+
+
+def dict_to_args(specials, items):
+    """
+    Transform
+        [(key1, val1), (special, val_special), (key3, val3) ]
+    into:
+        [ '--key1', 'val1', '--key3', 'val3', 'val_special' ]
+    """
+    args = []
+    special_vals = OrderedDict((k, '') for k in specials.split(','))
+    for (k, v) in items:
+        if k in special_vals:
+            special_vals[k] = v
+        else:
+            args.append('--{k}'.format(k=k))
+            args.append(v)
+    args.extend(arg for arg in special_vals.values() if arg)
+    return args
+
+def run_section_cmds(ctx, cclient, section_cmd, specials,
+                     section_config_list):
+    admin_host, admin_port = ctx.keystone.admin_endpoints[cclient]
+
+    auth_section = [
+        ( 'os-username', 'admin' ),
+        ( 'os-password', 'ADMIN' ),
+        ( 'os-user-domain-id', 'default' ),
+        ( 'os-project-name', 'admin' ),
+        ( 'os-project-domain-id', 'default' ),
+        ( 'os-identity-api-version', '3' ),
+        ( 'os-auth-url', 'http://{host}:{port}/v3'.format(host=admin_host,
+                                                          port=admin_port) ),
+    ]
+
+    for section_item in section_config_list:
+        run_in_keystone_venv(ctx, cclient,
+            [ 'openstack' ] + section_cmd.split() +
+            dict_to_args(specials, auth_section + list(section_item.items())) +
+            [ '--debug' ])
+
+def create_endpoint(ctx, cclient, service, url, adminurl=None):
+    endpoint_sections = [
+        {'service': service, 'interface': 'public', 'url': url},
+    ]
+    if adminurl:
+        endpoint_sections.append(
+            {'service': service, 'interface': 'admin', 'url': adminurl}
+        )
+    run_section_cmds(ctx, cclient, 'endpoint create',
+                     'service,interface,url',
+                     endpoint_sections)
+
+@contextlib.contextmanager
+def fill_keystone(ctx, config):
+    assert isinstance(config, dict)
+
+    for (cclient, cconfig) in config.items():
+        public_host, public_port = ctx.keystone.public_endpoints[cclient]
+        url = 'http://{host}:{port}/v3'.format(host=public_host,
+                                               port=public_port)
+        admin_host, admin_port = ctx.keystone.admin_endpoints[cclient]
+        admin_url = 'http://{host}:{port}/v3'.format(host=admin_host,
+                                                     port=admin_port)
+        opts = {'password': 'ADMIN',
+                'region-id': 'RegionOne',
+                'internal-url': url,
+                'admin-url': admin_url,
+                'public-url': url}
+        bootstrap_args = chain.from_iterable(('--bootstrap-{}'.format(k), v)
+                                             for k, v in opts.items())
+        run_in_keystone_venv(ctx, cclient,
+                             ['keystone-manage', 'bootstrap'] +
+                             list(bootstrap_args))
+
+        # configure tenants/projects
+        run_section_cmds(ctx, cclient, 'domain create', 'name',
+                         cconfig.get('domains', []))
+        run_section_cmds(ctx, cclient, 'project create', 'name',
+                         cconfig.get('projects', []))
+        run_section_cmds(ctx, cclient, 'user create', 'name',
+                         cconfig.get('users', []))
+        run_section_cmds(ctx, cclient, 'role create', 'name',
+                         cconfig.get('roles', []))
+        run_section_cmds(ctx, cclient, 'role add', 'name',
+                         cconfig.get('role-mappings', []))
+        run_section_cmds(ctx, cclient, 'service create', 'type',
+                         cconfig.get('services', []))
+
+        # for the deferred endpoint creation; currently it's used in rgw.py
+        ctx.keystone.create_endpoint = create_endpoint
+
+        # sleep driven synchronization -- just in case
+        run_in_keystone_venv(ctx, cclient, [ 'sleep', '3' ])
+    try:
+        yield
+    finally:
+        pass
+
+def assign_ports(ctx, config, initial_port):
+    """
+    Assign port numbers starting from @initial_port
+    """
+    port = initial_port
+    role_endpoints = {}
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for role in roles_for_host:
+            if role in config:
+                role_endpoints[role] = (remote.name.split('@')[1], port)
+                port += 1
+
+    return role_endpoints
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy and configure Keystone
+
+    Example of configuration:
+
+      - install:
+      - ceph:
+      - tox: [ client.0 ]
+      - keystone:
+          client.0:
+            force-branch: master
+            domains:
+              - name: default
+                description: Default Domain
+            projects:
+              - name: admin
+                description:  Admin Tenant
+            users:
+              - name: admin
+                password: ADMIN
+                project: admin
+            roles: [ name: admin, name: Member ]
+            role-mappings:
+              - name: admin
+                user: admin
+                project: admin
+            services:
+              - name: keystone
+                type: identity
+                description: Keystone Identity Service
+              - name: swift
+                type: object-store
+                description: Swift Service
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task keystone only supports a list or dictionary for configuration"
+
+    if not hasattr(ctx, 'tox'):
+        raise ConfigError('keystone must run after the tox task')
+
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    log.debug('Keystone config is %s', config)
+
+    ctx.keystone = argparse.Namespace()
+    ctx.keystone.public_endpoints = assign_ports(ctx, config, 5000)
+    ctx.keystone.admin_endpoints = assign_ports(ctx, config, 35357)
+
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: install_packages(ctx=ctx, config=config),
+        lambda: setup_venv(ctx=ctx, config=config),
+        lambda: configure_instance(ctx=ctx, config=config),
+        lambda: run_keystone(ctx=ctx, config=config),
+        lambda: fill_keystone(ctx=ctx, config=config),
+        ):
+        yield
diff --git a/qa/tasks/kubeadm.py b/qa/tasks/kubeadm.py
new file mode 100644
index 000000000..9f147c6e7
--- /dev/null
+++ b/qa/tasks/kubeadm.py
@@ -0,0 +1,562 @@
+"""
+Kubernetes cluster task, deployed via kubeadm
+"""
+import argparse
+import contextlib
+import ipaddress
+import logging
+import random
+import yaml
+from io import BytesIO
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+def _kubectl(ctx, config, args, **kwargs):
+    cluster_name = config['cluster']
+    ctx.kubeadm[cluster_name].bootstrap_remote.run(
+        args=['kubectl'] + args,
+        **kwargs,
+    )
+
+
+def kubectl(ctx, config):
+    if isinstance(config, str):
+        config = [config]
+    assert isinstance(config, list)
+    for c in config:
+        if isinstance(c, str):
+            _kubectl(ctx, config, c.split(' '))
+        else:
+            _kubectl(ctx, config, c)
+
+
+@contextlib.contextmanager
+def preflight(ctx, config):
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo', 'modprobe', 'br_netfilter',
+                run.Raw('&&'),
+                'sudo', 'sysctl', 'net.bridge.bridge-nf-call-ip6tables=1',
+                run.Raw('&&'),
+                'sudo', 'sysctl', 'net.bridge.bridge-nf-call-iptables=1',
+                run.Raw('&&'),
+                'sudo', 'sysctl', 'net.ipv4.ip_forward=1',
+                run.Raw('&&'),
+                'sudo', 'swapoff', '-a',
+            ],
+            wait=False,
+        )
+    )
+
+    # set docker cgroup driver = systemd
+    #  see https://kubernetes.io/docs/setup/production-environment/container-runtimes/#docker
+    #  see https://github.com/kubernetes/kubeadm/issues/2066
+    daemon_json = """
+{
+  "exec-opts": ["native.cgroupdriver=systemd"],
+  "log-driver": "json-file",
+  "log-opts": {
+    "max-size": "100m"
+  },
+  "storage-driver": "overlay2"
+}
+"""
+    for remote in ctx.cluster.remotes.keys():
+        remote.write_file('/etc/docker/daemon.json', daemon_json, sudo=True)
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo', 'systemctl', 'restart', 'docker',
+                run.Raw('||'),
+                'true',
+            ],
+            wait=False,
+        )
+    )
+    yield
+
+
+@contextlib.contextmanager
+def kubeadm_install(ctx, config):
+    version = config.get('version', '1.21')
+
+    os_type = teuthology.get_distro(ctx)
+    os_version = teuthology.get_distro_version(ctx)
+
+    try:
+        if os_type in ['centos', 'rhel']:
+            os = f"CentOS_{os_version.split('.')[0]}"
+            log.info('Installing cri-o')
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo',
+                        'curl', '-L', '-o',
+                        '/etc/yum.repos.d/devel:kubic:libcontainers:stable.repo',
+                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/{os}/devel:kubic:libcontainers:stable.repo',
+                        run.Raw('&&'),
+                        'sudo',
+                        'curl', '-L', '-o',
+                        f'/etc/yum.repos.d/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
+                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/{version}/{os}/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
+                        run.Raw('&&'),
+                        'sudo', 'dnf', 'install', '-y', 'cri-o',
+                    ],
+                    wait=False,
+                )
+            )
+
+            log.info('Installing kube{adm,ctl,let}')
+            repo = """[kubernetes]
+name=Kubernetes
+baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch
+enabled=1
+gpgcheck=1
+repo_gpgcheck=1
+gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
+"""
+            for remote in ctx.cluster.remotes.keys():
+                remote.write_file(
+                    '/etc/yum.repos.d/kubernetes.repo',
+                    repo,
+                    sudo=True,
+                )
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo', 'dnf', 'install', '-y',
+                        'kubelet', 'kubeadm', 'kubectl',
+                        'iproute-tc', 'bridge-utils',
+                    ],
+                    wait=False,
+                )
+            )
+
+            # fix cni config
+            for remote in ctx.cluster.remotes.keys():
+                conf = """# from https://github.com/cri-o/cri-o/blob/master/tutorials/kubernetes.md#flannel-network
+{
+    "name": "crio",
+    "type": "flannel"
+}
+"""
+                remote.write_file('/etc/cni/net.d/10-crio-flannel.conf', conf, sudo=True)
+                remote.run(args=[
+                    'sudo', 'rm', '-f',
+                    '/etc/cni/net.d/87-podman-bridge.conflist',
+                    '/etc/cni/net.d/100-crio-bridge.conf',
+                ])
+
+            # start crio
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo', 'systemctl', 'daemon-reload',
+                        run.Raw('&&'),
+                        'sudo', 'systemctl', 'enable', 'crio', '--now',
+                    ],
+                    wait=False,
+                )
+            )
+
+        elif os_type == 'ubuntu':
+            os = f"xUbuntu_{os_version}"
+            log.info('Installing kube{adm,ctl,let}')
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo', 'apt', 'update',
+                        run.Raw('&&'),
+                        'sudo', 'apt', 'install', '-y',
+                        'apt-transport-https', 'ca-certificates', 'curl',
+                        run.Raw('&&'),
+                        'sudo', 'curl', '-fsSLo',
+                        '/usr/share/keyrings/kubernetes-archive-keyring.gpg',
+                        'https://packages.cloud.google.com/apt/doc/apt-key.gpg',
+                        run.Raw('&&'),
+                        'echo', 'deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main',
+                        run.Raw('|'),
+                        'sudo', 'tee', '/etc/apt/sources.list.d/kubernetes.list',
+                        run.Raw('&&'),
+                        'sudo', 'apt', 'update',
+                        run.Raw('&&'),
+                        'sudo', 'apt', 'install', '-y',
+                        'kubelet', 'kubeadm', 'kubectl',
+                        'bridge-utils',
+                    ],
+                    wait=False,
+                )
+            )
+
+        else:
+            raise RuntimeError(f'unsupported distro {os_type} for cri-o')
+
+        run.wait(
+            ctx.cluster.run(
+                args=[
+                    'sudo', 'systemctl', 'enable', '--now', 'kubelet',
+                    run.Raw('&&'),
+                    'sudo', 'kubeadm', 'config', 'images', 'pull',
+                ],
+                wait=False,
+            )
+        )
+
+        yield
+
+    finally:
+        if config.get('uninstall', True):
+            log.info('Uninstalling kube{adm,let,ctl}')
+            if os_type in ['centos', 'rhel']:
+                run.wait(
+                    ctx.cluster.run(
+                        args=[
+                            'sudo', 'rm', '-f',
+                            '/etc/yum.repos.d/kubernetes.repo',
+                            run.Raw('&&'),
+                            'sudo', 'dnf', 'remove', '-y',
+                            'kubeadm', 'kubelet', 'kubectl', 'cri-o',
+                        ],
+                        wait=False
+                    )
+                )
+            elif os_type == 'ubuntu' and False:
+                run.wait(
+                    ctx.cluster.run(
+                        args=[
+                            'sudo', 'rm', '-f',
+                            '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list',
+                            f'/etc/apt/sources.list.d/devel:kubic:libcontainers:stable:cri-o:{version}.list',
+                            '/etc/apt/trusted.gpg.d/libcontainers-cri-o.gpg',
+                            run.Raw('&&'),
+                            'sudo', 'apt', 'remove', '-y',
+                            'kkubeadm', 'kubelet', 'kubectl', 'cri-o', 'cri-o-runc',
+                        ],
+                        wait=False,
+                    )
+                )
+
+
+@contextlib.contextmanager
+def kubeadm_init_join(ctx, config):
+    cluster_name = config['cluster']
+
+    bootstrap_remote = None
+    remotes = {}      # remote -> ip
+    for remote, roles in ctx.cluster.remotes.items():
+        for role in roles:
+            if role.startswith('host.'):
+                if not bootstrap_remote:
+                    bootstrap_remote = remote
+                if remote not in remotes:
+                    remotes[remote] = remote.ssh.get_transport().getpeername()[0]
+    if not bootstrap_remote:
+        raise RuntimeError('must define at least one host.something role')
+    ctx.kubeadm[cluster_name].bootstrap_remote = bootstrap_remote
+    ctx.kubeadm[cluster_name].remotes = remotes
+    ctx.kubeadm[cluster_name].token = 'abcdef.' + ''.join([
+        random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for _ in range(16)
+    ])
+    log.info(f'Token: {ctx.kubeadm[cluster_name].token}')
+    log.info(f'Remotes: {ctx.kubeadm[cluster_name].remotes}')
+
+    try:
+        # init
+        cmd = [
+            'sudo', 'kubeadm', 'init',
+            '--node-name', ctx.kubeadm[cluster_name].bootstrap_remote.shortname,
+            '--token', ctx.kubeadm[cluster_name].token,
+            '--pod-network-cidr', str(ctx.kubeadm[cluster_name].pod_subnet),
+        ]
+        bootstrap_remote.run(args=cmd)
+
+        # join additional nodes
+        joins = []
+        for remote, ip in ctx.kubeadm[cluster_name].remotes.items():
+            if remote == bootstrap_remote:
+                continue
+            cmd = [
+                'sudo', 'kubeadm', 'join',
+                ctx.kubeadm[cluster_name].remotes[ctx.kubeadm[cluster_name].bootstrap_remote] + ':6443',
+                '--node-name', remote.shortname,
+                '--token', ctx.kubeadm[cluster_name].token,
+                '--discovery-token-unsafe-skip-ca-verification',
+            ]
+            joins.append(remote.run(args=cmd, wait=False))
+        run.wait(joins)
+        yield
+
+    except Exception as e:
+        log.exception(e)
+        raise
+
+    finally:
+        log.info('Cleaning up node')
+        run.wait(
+            ctx.cluster.run(
+                args=['sudo', 'kubeadm', 'reset', 'cleanup-node', '-f'],
+                wait=False,
+            )
+        )
+
+
+@contextlib.contextmanager
+def kubectl_config(ctx, config):
+    cluster_name = config['cluster']
+    bootstrap_remote = ctx.kubeadm[cluster_name].bootstrap_remote
+
+    ctx.kubeadm[cluster_name].admin_conf = \
+        bootstrap_remote.read_file('/etc/kubernetes/admin.conf', sudo=True)
+
+    log.info('Setting up kubectl')
+    try:
+        ctx.cluster.run(args=[
+            'mkdir', '-p', '.kube',
+            run.Raw('&&'),
+            'sudo', 'mkdir', '-p', '/root/.kube',
+        ])
+        for remote in ctx.kubeadm[cluster_name].remotes.keys():
+            remote.write_file('.kube/config', ctx.kubeadm[cluster_name].admin_conf)
+            remote.sudo_write_file('/root/.kube/config',
+                                   ctx.kubeadm[cluster_name].admin_conf)
+        yield
+
+    except Exception as e:
+        log.exception(e)
+        raise
+
+    finally:
+        log.info('Deconfiguring kubectl')
+        ctx.cluster.run(args=[
+            'rm', '-rf', '.kube',
+            run.Raw('&&'),
+            'sudo', 'rm', '-rf', '/root/.kube',
+        ])
+
+
+def map_vnet(mip):
+    for mapping in teuth_config.get('vnet', []):
+        mnet = ipaddress.ip_network(mapping['machine_subnet'])
+        vnet = ipaddress.ip_network(mapping['virtual_subnet'])
+        if vnet.prefixlen >= mnet.prefixlen:
+            log.error(f"virtual_subnet {vnet} prefix >= machine_subnet {mnet} prefix")
+            return None
+        if mip in mnet:
+            pos = list(mnet.hosts()).index(mip)
+            log.info(f"{mip} is in {mnet} at pos {pos}")
+            sub = list(vnet.subnets(32 - mnet.prefixlen))[pos]
+            return sub
+    return None
+
+
+@contextlib.contextmanager
+def allocate_pod_subnet(ctx, config):
+    """
+    Allocate a private subnet that will not collide with other test machines/clusters
+    """
+    cluster_name = config['cluster']
+    assert cluster_name == 'kubeadm', 'multiple subnets not yet implemented'
+
+    log.info('Identifying pod subnet')
+    remote = list(ctx.cluster.remotes.keys())[0]
+    ip = remote.ssh.get_transport().getpeername()[0]
+    mip = ipaddress.ip_address(ip)
+    vnet = map_vnet(mip)
+    assert vnet
+    log.info(f'Pod subnet: {vnet}')
+    ctx.kubeadm[cluster_name].pod_subnet = vnet
+    yield
+
+
+@contextlib.contextmanager
+def pod_network(ctx, config):
+    cluster_name = config['cluster']
+    pnet = config.get('pod_network', 'calico')
+    if pnet == 'flannel':
+        r = ctx.kubeadm[cluster_name].bootstrap_remote.run(
+            args=[
+                'curl',
+                'https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml',
+            ],
+            stdout=BytesIO(),
+        )
+        assert r.exitstatus == 0
+        flannel = list(yaml.load_all(r.stdout.getvalue(), Loader=yaml.FullLoader))
+        for o in flannel:
+            if o.get('data', {}).get('net-conf.json'):
+                log.info(f'Updating {o}')
+                o['data']['net-conf.json'] = o['data']['net-conf.json'].replace(
+                    '10.244.0.0/16',
+                    str(ctx.kubeadm[cluster_name].pod_subnet)
+                )
+                log.info(f'Now {o}')
+        flannel_yaml = yaml.dump_all(flannel)
+        log.debug(f'Flannel:\n{flannel_yaml}')
+        _kubectl(ctx, config, ['apply', '-f', '-'], stdin=flannel_yaml)
+
+    elif pnet == 'calico':
+        _kubectl(ctx, config, [
+            'create', '-f',
+            'https://docs.projectcalico.org/manifests/tigera-operator.yaml'
+        ])
+        cr = {
+            'apiVersion': 'operator.tigera.io/v1',
+            'kind': 'Installation',
+            'metadata': {'name': 'default'},
+            'spec': {
+                'calicoNetwork': {
+                    'ipPools': [
+                        {
+                            'blockSize': 26,
+                            'cidr': str(ctx.kubeadm[cluster_name].pod_subnet),
+                            'encapsulation': 'VXLANCrossSubnet',
+                            'natOutgoing': 'Enabled',
+                            'nodeSelector': 'all()',
+                        }
+                    ]
+                }
+            }
+        }
+        _kubectl(ctx, config, ['create', '-f', '-'], stdin=yaml.dump(cr))
+
+    else:
+        raise RuntimeError(f'unrecognized pod_network {pnet}')
+
+    try:
+        yield
+
+    finally:
+        if pnet == 'flannel':
+            _kubectl(ctx, config, [
+                'delete', '-f',
+                'https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml',
+            ])
+
+        elif pnet == 'calico':
+            _kubectl(ctx, config, ['delete', 'installation', 'default'])
+            _kubectl(ctx, config, [
+                'delete', '-f',
+                'https://docs.projectcalico.org/manifests/tigera-operator.yaml'
+            ])
+
+
+@contextlib.contextmanager
+def setup_pvs(ctx, config):
+    """
+    Create PVs for all scratch LVs and set up a trivial provisioner
+    """
+    log.info('Scanning for scratch devices')
+    crs = []
+    for remote in ctx.cluster.remotes.keys():
+        ls = remote.read_file('/scratch_devs').decode('utf-8').strip().splitlines()
+        log.info(f'Scratch devices on {remote.shortname}: {ls}')
+        for dev in ls:
+            devname = dev.split('/')[-1].replace("_", "-")
+            crs.append({
+                'apiVersion': 'v1',
+                'kind': 'PersistentVolume',
+                'metadata': {'name': f'{remote.shortname}-{devname}'},
+                'spec': {
+                    'volumeMode': 'Block',
+                    'accessModes': ['ReadWriteOnce'],
+                    'capacity': {'storage': '100Gi'},  # doesn't matter?
+                    'persistentVolumeReclaimPolicy': 'Recycle',
+                    'storageClassName': 'scratch',
+                    'local': {'path': dev},
+                    'nodeAffinity': {
+                        'required': {
+                            'nodeSelectorTerms': [
+                                {
+                                    'matchExpressions': [
+                                        {
+                                            'key': 'kubernetes.io/hostname',
+                                            'operator': 'In',
+                                            'values': [remote.shortname]
+                                        }
+                                    ]
+                                }
+                            ]
+                        }
+                    }
+                }
+            })
+            # overwriting first few MB is enough to make k8s happy
+            remote.run(args=[
+                'sudo', 'dd', 'if=/dev/zero', f'of={dev}', 'bs=1M', 'count=10'
+            ])
+    crs.append({
+        'kind': 'StorageClass',
+        'apiVersion': 'storage.k8s.io/v1',
+        'metadata': {'name': 'scratch'},
+        'provisioner': 'kubernetes.io/no-provisioner',
+        'volumeBindingMode': 'WaitForFirstConsumer',
+    })
+    y = yaml.dump_all(crs)
+    log.info('Creating PVs + StorageClass')
+    log.debug(y)
+    _kubectl(ctx, config, ['create', '-f', '-'], stdin=y)
+
+    yield
+
+
+@contextlib.contextmanager
+def final(ctx, config):
+    cluster_name = config['cluster']
+
+    # remove master node taint
+    _kubectl(ctx, config, [
+        'taint', 'node',
+        ctx.kubeadm[cluster_name].bootstrap_remote.shortname,
+        'node-role.kubernetes.io/master-',
+        run.Raw('||'),
+        'true',
+    ])
+
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    if not config:
+        config = {}
+    assert isinstance(config, dict), \
+        "task only supports a dictionary for configuration"
+
+    log.info('Kubeadm start')
+
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('kubeadm', {}))
+    log.info('Config: ' + str(config))
+
+    # set up cluster context
+    if not hasattr(ctx, 'kubeadm'):
+        ctx.kubeadm = {}
+    if 'cluster' not in config:
+        config['cluster'] = 'kubeadm'
+    cluster_name = config['cluster']
+    if cluster_name not in ctx.kubeadm:
+        ctx.kubeadm[cluster_name] = argparse.Namespace()
+
+    with contextutil.nested(
+            lambda: preflight(ctx, config),
+            lambda: allocate_pod_subnet(ctx, config),
+            lambda: kubeadm_install(ctx, config),
+            lambda: kubeadm_init_join(ctx, config),
+            lambda: kubectl_config(ctx, config),
+            lambda: pod_network(ctx, config),
+            lambda: setup_pvs(ctx, config),
+            lambda: final(ctx, config),
+    ):
+        try:
+            log.info('Kubeadm complete, yielding')
+            yield
+
+        finally:
+            log.info('Tearing down kubeadm')
diff --git a/qa/tasks/locktest.py b/qa/tasks/locktest.py
new file mode 100755
index 000000000..9de5ba40c
--- /dev/null
+++ b/qa/tasks/locktest.py
@@ -0,0 +1,134 @@
+"""
+locktests
+"""
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Run locktests, from the xfstests suite, on the given
+    clients. Whether the clients are ceph-fuse or kernel does not
+    matter, and the two clients can refer to the same mount.
+
+    The config is a list of two clients to run the locktest on. The
+    first client will be the host.
+
+    For example:
+       tasks:
+       - ceph:
+       - ceph-fuse: [client.0, client.1]
+       - locktest:
+           [client.0, client.1]
+
+    This task does not yield; there would be little point.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+
+    assert isinstance(config, list)
+    log.info('fetching and building locktests...')
+    (host,) = ctx.cluster.only(config[0]).remotes
+    (client,) = ctx.cluster.only(config[1]).remotes
+    ( _, _, host_id) = config[0].partition('.')
+    ( _, _, client_id) = config[1].partition('.')
+    testdir = teuthology.get_testdir(ctx)
+    hostmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=host_id)
+    clientmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=client_id)
+
+    try:
+        for client_name in config:
+            log.info('building on {client_}'.format(client_=client_name))
+            ctx.cluster.only(client_name).run(
+                args=[
+                    # explicitly does not support multiple autotest tasks
+                    # in a single run; the result archival would conflict
+                    'mkdir', '{tdir}/archive/locktest'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'mkdir', '{tdir}/locktest'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'wget',
+                    '-nv',
+                    'https://raw.github.com/gregsfortytwo/xfstests-ceph/master/src/locktest.c',
+                    '-O', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'g++', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+                    '-o', '{tdir}/locktest/locktest'.format(tdir=testdir)
+                    ],
+                logger=log.getChild('locktest_client.{id}'.format(id=client_name)),
+                )
+
+        log.info('built locktest on each client')
+
+        host.run(args=['sudo', 'touch',
+                       '{mnt}/locktestfile'.format(mnt=hostmnt),
+                       run.Raw('&&'),
+                       'sudo', 'chown', 'ubuntu.ubuntu',
+                       '{mnt}/locktestfile'.format(mnt=hostmnt)
+                       ]
+                 )
+
+        log.info('starting on host')
+        hostproc = host.run(
+            args=[
+                '{tdir}/locktest/locktest'.format(tdir=testdir),
+                '-p', '6788',
+                '-d',
+                '{mnt}/locktestfile'.format(mnt=hostmnt),
+                ],
+            wait=False,
+            logger=log.getChild('locktest.host'),
+            )
+        log.info('starting on client')
+        (_,_,hostaddr) = host.name.partition('@')
+        clientproc = client.run(
+            args=[
+                '{tdir}/locktest/locktest'.format(tdir=testdir),
+                '-p', '6788',
+                '-d',
+                '-h', hostaddr,
+                '{mnt}/locktestfile'.format(mnt=clientmnt),
+                ],
+            logger=log.getChild('locktest.client'),
+            wait=False
+            )
+
+        hostresult = hostproc.wait()
+        clientresult = clientproc.wait()
+        if (hostresult != 0) or (clientresult != 0):
+            raise Exception("Did not pass locking test!")
+        log.info('finished locktest executable with results {r} and {s}'. \
+                     format(r=hostresult, s=clientresult))
+
+    finally:
+        log.info('cleaning up host dir')
+        host.run(
+            args=[
+                'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rmdir', '{tdir}/locktest'
+                ],
+            logger=log.getChild('.{id}'.format(id=config[0])),
+            )
+        log.info('cleaning up client dir')
+        client.run(
+            args=[
+                'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rmdir', '{tdir}/locktest'.format(tdir=testdir)
+                ],
+            logger=log.getChild('.{id}'.format(\
+                    id=config[1])),
+            )
diff --git a/qa/tasks/logrotate.conf b/qa/tasks/logrotate.conf
new file mode 100644
index 000000000..b0cb8012f
--- /dev/null
+++ b/qa/tasks/logrotate.conf
@@ -0,0 +1,13 @@
+/var/log/ceph/*{daemon_type}*.log {{
+    rotate 100
+    size {max_size}
+    compress
+    sharedscripts
+    postrotate
+        killall {daemon_type} -1 || true
+    endscript
+    missingok
+    notifempty
+    su root root
+}}
+
diff --git a/qa/tasks/lost_unfound.py b/qa/tasks/lost_unfound.py
new file mode 100644
index 000000000..5a9142a70
--- /dev/null
+++ b/qa/tasks/lost_unfound.py
@@ -0,0 +1,180 @@
+"""
+Lost_unfound
+"""
+import logging
+import time
+from tasks import ceph_manager
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of lost objects.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    POOL = 'unfound_pool'
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    manager.wait_for_clean()
+
+    manager.create_pool(POOL)
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+
+    # take an osd out until the very end
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+    manager.mark_out_osd(2)
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.1',
+            'injectargs',
+            '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+            )
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+    
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+
+    # bring osd.0 back up, let it peer, but don't replicate the new
+    # objects...
+    log.info('osd.0 command_args is %s' % 'foo')
+    log.info(ctx.daemons.get_daemon('osd', 0).command_args)
+    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
+            '--osd-recovery-delay-start', '1000'
+            ])
+    manager.revive_osd(0)
+    manager.mark_in_osd(0)
+    manager.wait_till_osd_is_up(0)
+
+    manager.flush_pg_stats([1, 0])
+    manager.wait_till_active()
+
+    # take out osd.1 and the only copy of those objects.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.mark_out_osd(1)
+    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+
+    # bring up osd.2 so that things would otherwise, in theory, recovery fully
+    manager.revive_osd(2)
+    manager.mark_in_osd(2)
+    manager.wait_till_osd_is_up(2)
+
+    manager.flush_pg_stats([0, 2])
+    manager.wait_till_active()
+    manager.flush_pg_stats([0, 2])
+
+    # verify that there are unfound objects
+    unfound = manager.get_num_unfound_objects()
+    log.info("there are %d unfound objects" % unfound)
+    assert unfound
+
+    testdir = teuthology.get_testdir(ctx)
+    procs = []
+    if config.get('parallel_bench', True):
+        procs.append(mon.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'rados',
+                          '--no-log-to-stderr',
+                          '--name', 'client.admin',
+                          '-b', str(4<<10),
+                          '-p' , POOL,
+                          '-t', '20',
+                          'bench', '240', 'write',
+                      ]).format(tdir=testdir),
+            ],
+            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+            stdin=run.PIPE,
+            wait=False
+        ))
+    time.sleep(10)
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        if pg['stat_sum']['num_objects_unfound'] > 0:
+            primary = 'osd.%d' % pg['acting'][0]
+
+            # verify that i can list them direct from the osd
+            log.info('listing missing/lost in %s state %s', pg['pgid'],
+                     pg['state']);
+            m = manager.list_pg_unfound(pg['pgid'])
+            #log.info('%s' % m)
+            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+            assert m['available_might_have_unfound'] == True
+            assert m['might_have_unfound'][0]['osd'] == "1"
+            assert m['might_have_unfound'][0]['status'] == "osd is down"
+            num_unfound=0
+            for o in m['objects']:
+                if len(o['locations']) == 0:
+                    num_unfound += 1
+            assert m['num_unfound'] == num_unfound
+
+            log.info("reverting unfound in %s on %s", pg['pgid'], primary)
+            manager.raw_cluster_cmd('pg', pg['pgid'],
+                                    'mark_unfound_lost', 'revert')
+        else:
+            log.info("no unfound in %s", pg['pgid'])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+    manager.flush_pg_stats([0, 2])
+    manager.wait_for_recovery()
+
+    # verify result
+    for f in range(1, 10):
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
+        assert not err
+
+    # see if osd.1 can cope
+    manager.mark_in_osd(1)
+    manager.revive_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.wait_for_clean()
+    run.wait(procs)
+    manager.wait_for_clean()
diff --git a/qa/tasks/manypools.py b/qa/tasks/manypools.py
new file mode 100644
index 000000000..7fe7e43e1
--- /dev/null
+++ b/qa/tasks/manypools.py
@@ -0,0 +1,73 @@
+"""
+Force pg creation on all osds
+"""
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+import logging
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Create the specified number of pools and write 16 objects to them (thereby forcing
+    the PG creation on each OSD). This task creates pools from all the clients,
+    in parallel. It is easy to add other daemon types which have the appropriate
+    permissions, but I don't think anything else does.
+    The config is just the number of pools to create. I recommend setting
+    "mon create pg interval" to a very low value in your ceph config to speed
+    this up.
+    
+    You probably want to do this to look at memory consumption, and
+    maybe to test how performance changes with the number of PGs. For example:
+    
+    tasks:
+    - ceph:
+        config:
+          mon:
+            mon create pg interval: 1
+    - manypools: 3000
+    - radosbench:
+        clients: [client.0]
+        time: 360
+    """
+    
+    log.info('creating {n} pools'.format(n=config))
+    
+    poolnum = int(config)
+    creator_remotes = []
+    client_roles = teuthology.all_roles_of_type(ctx.cluster, 'client')
+    log.info('got client_roles={client_roles_}'.format(client_roles_=client_roles))
+    for role in client_roles:
+        log.info('role={role_}'.format(role_=role))
+        (creator_remote, ) = ctx.cluster.only('client.{id}'.format(id=role)).remotes.keys()
+        creator_remotes.append((creator_remote, 'client.{id}'.format(id=role)))
+
+    remaining_pools = poolnum
+    poolprocs=dict()
+    while (remaining_pools > 0):
+        log.info('{n} pools remaining to create'.format(n=remaining_pools))
+        for remote, role_ in creator_remotes:
+            poolnum = remaining_pools
+            remaining_pools -= 1
+            if remaining_pools < 0:
+                continue
+            log.info('creating pool{num} on {role}'.format(num=poolnum, role=role_))
+            proc = remote.run(
+                args=[
+                    'ceph',
+                    '--name', role_,
+                    'osd', 'pool', 'create', 'pool{num}'.format(num=poolnum), '8',
+                    run.Raw('&&'),
+                    'rados',
+                    '--name', role_,
+                    '--pool', 'pool{num}'.format(num=poolnum),
+                    'bench', '0', 'write', '-t', '16', '--block-size', '1'
+                ],
+                wait = False
+            )
+            log.info('waiting for pool and object creates')
+            poolprocs[remote] = proc
+
+        run.wait(poolprocs.values())
+
+    log.info('created all {n} pools and wrote 16 objects to each'.format(n=poolnum))
diff --git a/qa/tasks/mds_creation_failure.py b/qa/tasks/mds_creation_failure.py
new file mode 100644
index 000000000..58314086c
--- /dev/null
+++ b/qa/tasks/mds_creation_failure.py
@@ -0,0 +1,69 @@
+# FIXME: this file has many undefined vars which are accessed!
+# flake8: noqa
+import logging
+import contextlib
+import time
+from tasks import ceph_manager
+from teuthology import misc
+from teuthology.orchestra.run import CommandFailedError, Raw
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Go through filesystem creation with a synthetic failure in an MDS
+    in its 'up:creating' state, to exercise the retry behaviour.
+    """
+    # Grab handles to the teuthology objects of interest
+    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+    if len(mdslist) != 1:
+        # Require exactly one MDS, the code path for creation failure when
+        # a standby is available is different
+        raise RuntimeError("This task requires exactly one MDS")
+
+    mds_id = mdslist[0]
+    (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.keys()
+    manager = ceph_manager.CephManager(
+        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
+    )
+
+    # Stop MDS
+    self.fs.set_max_mds(0)
+    self.fs.mds_stop(mds_id)
+    self.fs.mds_fail(mds_id)
+
+    # Reset the filesystem so that next start will go into CREATING
+    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
+    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
+
+    # Start the MDS with mds_kill_create_at set, it will crash during creation
+    mds.restart_with_args(["--mds_kill_create_at=1"])
+    try:
+        mds.wait_for_exit()
+    except CommandFailedError as e:
+        if e.exitstatus == 1:
+            log.info("MDS creation killed as expected")
+        else:
+            log.error("Unexpected status code %s" % e.exitstatus)
+            raise
+
+    # Since I have intentionally caused a crash, I will clean up the resulting core
+    # file to avoid task.internal.coredump seeing it as a failure.
+    log.info("Removing core file from synthetic MDS failure")
+    mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
+
+    # It should have left the MDS map state still in CREATING
+    status = self.fs.status().get_mds(mds_id)
+    assert status['state'] == 'up:creating'
+
+    # Start the MDS again without the kill flag set, it should proceed with creation successfully
+    mds.restart()
+
+    # Wait for state ACTIVE
+    self.fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)
+
+    # The system should be back up in a happy healthy state, go ahead and run any further tasks
+    # inside this context.
+    yield
diff --git a/qa/tasks/mds_pre_upgrade.py b/qa/tasks/mds_pre_upgrade.py
new file mode 100644
index 000000000..812d402ed
--- /dev/null
+++ b/qa/tasks/mds_pre_upgrade.py
@@ -0,0 +1,27 @@
+"""
+Prepare MDS cluster for upgrade.
+"""
+
+import logging
+
+from tasks.cephfs.filesystem import Filesystem
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Prepare MDS cluster for upgrade.
+
+    This task reduces ranks to 1 and stops all standbys.
+    """
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'snap-upgrade task only accepts a dict for configuration'
+
+    fs = Filesystem(ctx)
+    fs.getinfo() # load name
+    fs.set_allow_standby_replay(False)
+    fs.set_max_mds(1)
+    fs.reach_max_mds()
diff --git a/qa/tasks/mds_thrash.py b/qa/tasks/mds_thrash.py
new file mode 100644
index 000000000..8c7f3cba5
--- /dev/null
+++ b/qa/tasks/mds_thrash.py
@@ -0,0 +1,434 @@
+"""
+Thrash mds by simulating failures
+"""
+import logging
+import contextlib
+import itertools
+import random
+import time
+
+from gevent import sleep
+from gevent.greenlet import Greenlet
+from gevent.event import Event
+from teuthology import misc as teuthology
+
+from tasks import ceph_manager
+from tasks.cephfs.filesystem import MDSCluster, Filesystem, FSMissing
+from tasks.thrasher import Thrasher
+
+log = logging.getLogger(__name__)
+
+class MDSThrasher(Thrasher, Greenlet):
+    """
+    MDSThrasher::
+
+    The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).
+
+    The config is optional.  Many of the config parameters are a a maximum value
+    to use when selecting a random value from a range.  To always use the maximum
+    value, set no_random to true.  The config is a dict containing some or all of:
+
+    max_thrash: [default: 1] the maximum number of active MDSs per FS that will be thrashed at
+      any given time.
+
+    max_thrash_delay: [default: 30] maximum number of seconds to delay before
+      thrashing again.
+
+    max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
+      the replay state before thrashing.
+
+    max_revive_delay: [default: 10] maximum number of seconds to delay before
+      bringing back a thrashed MDS.
+
+    randomize: [default: true] enables randomization and use the max/min values
+
+    seed: [no default] seed the random number generator
+
+    thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
+      during replay.  Value should be between 0.0 and 1.0.
+
+    thrash_max_mds: [default: 0.05] likelihood that the max_mds of the mds
+      cluster will be modified to a value [1, current) or (current, starting
+      max_mds]. Value should be between 0.0 and 1.0.
+
+    thrash_while_stopping: [default: false] thrash an MDS while there
+      are MDS in up:stopping (because max_mds was changed and some
+      MDS were deactivated).
+
+    thrash_weights: allows specific MDSs to be thrashed more/less frequently.
+      This option overrides anything specified by max_thrash.  This option is a
+      dict containing mds.x: weight pairs.  For example, [mds.a: 0.7, mds.b:
+      0.3, mds.c: 0.0].  Each weight is a value from 0.0 to 1.0.  Any MDSs not
+      specified will be automatically given a weight of 0.0 (not thrashed).
+      For a given MDS, by default the trasher delays for up to
+      max_thrash_delay, trashes, waits for the MDS to recover, and iterates.
+      If a non-zero weight is specified for an MDS, for each iteration the
+      thrasher chooses whether to thrash during that iteration based on a
+      random value [0-1] not exceeding the weight of that MDS.
+
+    Examples::
+
+
+      The following example sets the likelihood that mds.a will be thrashed
+      to 80%, mds.b to 20%, and other MDSs will not be thrashed.  It also sets the
+      likelihood that an MDS will be thrashed in replay to 40%.
+      Thrash weights do not have to sum to 1.
+
+      tasks:
+      - ceph:
+      - mds_thrash:
+          thrash_weights:
+            - mds.a: 0.8
+            - mds.b: 0.2
+          thrash_in_replay: 0.4
+      - ceph-fuse:
+      - workunit:
+          clients:
+            all: [suites/fsx.sh]
+
+      The following example disables randomization, and uses the max delay values:
+
+      tasks:
+      - ceph:
+      - mds_thrash:
+          max_thrash_delay: 10
+          max_revive_delay: 1
+          max_replay_thrash_delay: 4
+
+    """
+
+    def __init__(self, ctx, manager, config, fs, max_mds):
+        super(MDSThrasher, self).__init__()
+
+        self.config = config
+        self.ctx = ctx
+        self.logger = log.getChild('fs.[{f}]'.format(f = fs.name))
+        self.fs = fs
+        self.manager = manager
+        self.max_mds = max_mds
+        self.name = 'thrasher.fs.[{f}]'.format(f = fs.name)
+        self.stopping = Event()
+
+        self.randomize = bool(self.config.get('randomize', True))
+        self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.05))
+        self.max_thrash = int(self.config.get('max_thrash', 1))
+        self.max_thrash_delay = float(self.config.get('thrash_delay', 120.0))
+        self.thrash_in_replay = float(self.config.get('thrash_in_replay', False))
+        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
+            v=self.thrash_in_replay)
+        self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))
+        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
+
+    def _run(self):
+        try:
+            self.do_thrash()
+        except FSMissing:
+            pass
+        except Exception as e:
+            # Log exceptions here so we get the full backtrace (gevent loses them).
+            # Also allow successful completion as gevent exception handling is a broken mess:
+            #
+            # 2017-02-03T14:34:01.259 CRITICAL:root:  File "gevent.libev.corecext.pyx", line 367, in gevent.libev.corecext.loop.handle_error (src/gevent/libev/gevent.corecext.c:5051)
+            #   File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 558, in handle_error
+            #     self.print_exception(context, type, value, tb)
+            #   File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 605, in print_exception
+            #     traceback.print_exception(type, value, tb, file=errstream)
+            #   File "/usr/lib/python2.7/traceback.py", line 124, in print_exception
+            #     _print(file, 'Traceback (most recent call last):')
+            #   File "/usr/lib/python2.7/traceback.py", line 13, in _print
+            #     file.write(str+terminator)
+            # 2017-02-03T14:34:01.261 CRITICAL:root:IOError
+            self.set_thrasher_exception(e)
+            self.logger.exception("exception:")
+            # allow successful completion so gevent doesn't see an exception...
+
+    def log(self, x):
+        """Write data to the logger assigned to MDSThrasher"""
+        self.logger.info(x)
+
+    def stop(self):
+        self.stopping.set()
+
+    def kill_mds(self, mds):
+        if self.config.get('powercycle'):
+            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
+                         remotes.keys())
+            self.log('kill_mds on mds.{m} doing powercycle of {s}'.
+                     format(m=mds, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_off()
+        else:
+            self.ctx.daemons.get_daemon('mds', mds).stop()
+
+    @staticmethod
+    def _assert_ipmi(remote):
+        assert remote.console.has_ipmi_credentials, (
+            "powercycling requested but RemoteConsole is not "
+            "initialized.  Check ipmi config.")
+
+    def revive_mds(self, mds):
+        """
+        Revive mds -- do an ipmpi powercycle (if indicated by the config)
+        and then restart.
+        """
+        if self.config.get('powercycle'):
+            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
+                         remotes.keys())
+            self.log('revive_mds on mds.{m} doing powercycle of {s}'.
+                     format(m=mds, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            self.manager.make_admin_daemon_dir(self.ctx, remote)
+        args = []
+        self.ctx.daemons.get_daemon('mds', mds).restart(*args)
+
+    def wait_for_stable(self, rank = None, gid = None):
+        self.log('waiting for mds cluster to stabilize...')
+        for itercount in itertools.count():
+            status = self.fs.status()
+            max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
+            ranks = list(status.get_ranks(self.fs.id))
+            stopping = sum(1 for _ in ranks if "up:stopping" == _['state'])
+            actives = sum(1 for _ in ranks
+                          if "up:active" == _['state'] and "laggy_since" not in _)
+
+            if not bool(self.config.get('thrash_while_stopping', False)) and stopping > 0:
+                if itercount % 5 == 0:
+                    self.log('cluster is considered unstable while MDS are in up:stopping (!thrash_while_stopping)')
+            else:
+                if rank is not None:
+                    try:
+                        info = status.get_rank(self.fs.id, rank)
+                        if info['gid'] != gid and "up:active" == info['state']:
+                            self.log('mds.{name} has gained rank={rank}, replacing gid={gid}'.format(name = info['name'], rank = rank, gid = gid))
+                            return status
+                    except:
+                        pass # no rank present
+                    if actives >= max_mds:
+                        # no replacement can occur!
+                        self.log("cluster has {actives} actives (max_mds is {max_mds}), no MDS can replace rank {rank}".format(
+                            actives=actives, max_mds=max_mds, rank=rank))
+                        return status
+                else:
+                    if actives == max_mds:
+                        self.log('mds cluster has {count} alive and active, now stable!'.format(count = actives))
+                        return status, None
+            if itercount > 300/2: # 5 minutes
+                 raise RuntimeError('timeout waiting for cluster to stabilize')
+            elif itercount % 5 == 0:
+                self.log('mds map: {status}'.format(status=status))
+            else:
+                self.log('no change')
+            sleep(2)
+
+    def do_thrash(self):
+        """
+        Perform the random thrashing action
+        """
+
+        self.log('starting mds_do_thrash for fs {fs}'.format(fs = self.fs.name))
+        stats = {
+            "max_mds": 0,
+            "deactivate": 0,
+            "kill": 0,
+        }
+
+        while not self.stopping.is_set():
+            delay = self.max_thrash_delay
+            if self.randomize:
+                delay = random.randrange(0.0, self.max_thrash_delay)
+
+            if delay > 0.0:
+                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
+                self.stopping.wait(delay)
+                if self.stopping.is_set():
+                    continue
+
+            status = self.fs.status()
+
+            if random.random() <= self.thrash_max_mds:
+                max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
+                options = [i for i in range(1, self.max_mds + 1) if i != max_mds]
+                if len(options) > 0:
+                    new_max_mds = random.choice(options)
+                    self.log('thrashing max_mds: %d -> %d' % (max_mds, new_max_mds))
+                    self.fs.set_max_mds(new_max_mds)
+                    stats['max_mds'] += 1
+                    self.wait_for_stable()
+
+            count = 0
+            for info in status.get_ranks(self.fs.id):
+                name = info['name']
+                label = 'mds.' + name
+                rank = info['rank']
+                gid = info['gid']
+
+                # if thrash_weights isn't specified and we've reached max_thrash,
+                # we're done
+                count = count + 1
+                if 'thrash_weights' not in self.config and count > self.max_thrash:
+                    break
+
+                weight = 1.0
+                if 'thrash_weights' in self.config:
+                    weight = self.config['thrash_weights'].get(label, '0.0')
+                skip = random.randrange(0.0, 1.0)
+                if weight <= skip:
+                    self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, weight=weight))
+                    continue
+
+                self.log('kill {label} (rank={rank})'.format(label=label, rank=rank))
+                self.kill_mds(name)
+                stats['kill'] += 1
+
+                # wait for mon to report killed mds as crashed
+                last_laggy_since = None
+                itercount = 0
+                while True:
+                    status = self.fs.status()
+                    info = status.get_mds(name)
+                    if not info:
+                        break
+                    if 'laggy_since' in info:
+                        last_laggy_since = info['laggy_since']
+                        break
+                    if any([(f == name) for f in status.get_fsmap(self.fs.id)['mdsmap']['failed']]):
+                        break
+                    self.log(
+                        'waiting till mds map indicates {label} is laggy/crashed, in failed state, or {label} is removed from mdsmap'.format(
+                            label=label))
+                    itercount = itercount + 1
+                    if itercount > 10:
+                        self.log('mds map: {status}'.format(status=status))
+                    sleep(2)
+
+                if last_laggy_since:
+                    self.log(
+                        '{label} reported laggy/crashed since: {since}'.format(label=label, since=last_laggy_since))
+                else:
+                    self.log('{label} down, removed from mdsmap'.format(label=label))
+
+                # wait for a standby mds to takeover and become active
+                status = self.wait_for_stable(rank, gid)
+
+                # wait for a while before restarting old active to become new
+                # standby
+                delay = self.max_revive_delay
+                if self.randomize:
+                    delay = random.randrange(0.0, self.max_revive_delay)
+
+                self.log('waiting for {delay} secs before reviving {label}'.format(
+                    delay=delay, label=label))
+                sleep(delay)
+
+                self.log('reviving {label}'.format(label=label))
+                self.revive_mds(name)
+
+                for itercount in itertools.count():
+                    if itercount > 300/2: # 5 minutes
+                        raise RuntimeError('timeout waiting for MDS to revive')
+                    status = self.fs.status()
+                    info = status.get_mds(name)
+                    if info and info['state'] in ('up:standby', 'up:standby-replay', 'up:active'):
+                        self.log('{label} reported in {state} state'.format(label=label, state=info['state']))
+                        break
+                    self.log(
+                        'waiting till mds map indicates {label} is in active, standby or standby-replay'.format(label=label))
+                    sleep(2)
+
+        for stat in stats:
+            self.log("stat['{key}'] = {value}".format(key = stat, value = stats[stat]))
+
+             # don't do replay thrashing right now
+#            for info in status.get_replays(self.fs.id):
+#                # this might race with replay -> active transition...
+#                if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:
+#                    delay = self.max_replay_thrash_delay
+#                    if self.randomize:
+#                        delay = random.randrange(0.0, self.max_replay_thrash_delay)
+#                sleep(delay)
+#                self.log('kill replaying mds.{id}'.format(id=self.to_kill))
+#                self.kill_mds(self.to_kill)
+#
+#                delay = self.max_revive_delay
+#                if self.randomize:
+#                    delay = random.randrange(0.0, self.max_revive_delay)
+#
+#                self.log('waiting for {delay} secs before reviving mds.{id}'.format(
+#                    delay=delay, id=self.to_kill))
+#                sleep(delay)
+#
+#                self.log('revive mds.{id}'.format(id=self.to_kill))
+#                self.revive_mds(self.to_kill)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Stress test the mds by thrashing while another task/workunit
+    is running.
+
+    Please refer to MDSThrasher class for further information on the
+    available options.
+    """
+
+    mds_cluster = MDSCluster(ctx)
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'mds_thrash task only accepts a dict for configuration'
+    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
+    assert len(mdslist) > 1, \
+        'mds_thrash task requires at least 2 metadata servers'
+
+    # choose random seed
+    if 'seed' in config:
+        seed = int(config['seed'])
+    else:
+        seed = int(time.time())
+    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
+    random.seed(seed)
+
+    (first,) = ctx.cluster.only('mds.{_id}'.format(_id=mdslist[0])).remotes.keys()
+    manager = ceph_manager.CephManager(
+        first, ctx=ctx, logger=log.getChild('ceph_manager'),
+    )
+
+    # make sure everyone is in active, standby, or standby-replay
+    log.info('Wait for all MDSs to reach steady state...')
+    status = mds_cluster.status()
+    while True:
+        steady = True
+        for info in status.get_all():
+            state = info['state']
+            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
+                steady = False
+                break
+        if steady:
+            break
+        sleep(2)
+        status = mds_cluster.status()
+    log.info('Ready to start thrashing')
+
+    manager.wait_for_clean()
+    assert manager.is_clean()
+
+    if 'cluster' not in config:
+        config['cluster'] = 'ceph'
+
+    for fs in status.get_filesystems():
+        thrasher = MDSThrasher(ctx, manager, config, Filesystem(ctx, fscid=fs['id']), fs['mdsmap']['max_mds'])
+        thrasher.start()
+        ctx.ceph[config['cluster']].thrashers.append(thrasher)
+
+    try:
+        log.debug('Yielding')
+        yield
+    finally:
+        log.info('joining mds_thrasher')
+        thrasher.stop()
+        if thrasher.exception is not None:
+            raise RuntimeError('error during thrashing')
+        thrasher.join()
+        log.info('done joining')
diff --git a/qa/tasks/metadata.yaml b/qa/tasks/metadata.yaml
new file mode 100644
index 000000000..ccdc3b077
--- /dev/null
+++ b/qa/tasks/metadata.yaml
@@ -0,0 +1,2 @@
+instance-id: test
+local-hostname: test
diff --git a/qa/tasks/mgr/__init__.py b/qa/tasks/mgr/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/tasks/mgr/__init__.py
diff --git a/qa/tasks/mgr/dashboard/__init__.py b/qa/tasks/mgr/dashboard/__init__.py
new file mode 100644
index 000000000..2b022e024
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/__init__.py
@@ -0,0 +1 @@
+DEFAULT_API_VERSION = '1.0'
diff --git a/qa/tasks/mgr/dashboard/helper.py b/qa/tasks/mgr/dashboard/helper.py
new file mode 100644
index 000000000..2c6efa901
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/helper.py
@@ -0,0 +1,722 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=W0212,too-many-return-statements,too-many-public-methods
+from __future__ import absolute_import
+
+import json
+import logging
+import random
+import re
+import string
+import time
+from collections import namedtuple
+from typing import List
+
+import requests
+from tasks.mgr.mgr_test_case import MgrTestCase
+from teuthology.exceptions import \
+    CommandFailedError  # pylint: disable=import-error
+
+from . import DEFAULT_API_VERSION
+
+log = logging.getLogger(__name__)
+
+
+class DashboardTestCase(MgrTestCase):
+    # Display full error diffs
+    maxDiff = None
+
+    # Increased x3 (20 -> 60)
+    TIMEOUT_HEALTH_CLEAR = 60
+
+    MGRS_REQUIRED = 2
+    MDSS_REQUIRED = 1
+    REQUIRE_FILESYSTEM = True
+    CLIENTS_REQUIRED = 1
+    CEPHFS = False
+    ORCHESTRATOR = False
+    ORCHESTRATOR_TEST_DATA = {
+        'inventory': [
+            {
+                'name': 'test-host0',
+                'addr': '1.2.3.4',
+                'devices': [
+                    {
+                        'path': '/dev/sda',
+                    }
+                ]
+            },
+            {
+                'name': 'test-host1',
+                'addr': '1.2.3.5',
+                'devices': [
+                    {
+                        'path': '/dev/sdb',
+                    }
+                ]
+            }
+        ],
+        'daemons': [
+            {
+                'nodename': 'test-host0',
+                'daemon_type': 'mon',
+                'daemon_id': 'a'
+            },
+            {
+                'nodename': 'test-host0',
+                'daemon_type': 'mgr',
+                'daemon_id': 'x'
+            },
+            {
+                'nodename': 'test-host0',
+                'daemon_type': 'osd',
+                'daemon_id': '0'
+            },
+            {
+                'nodename': 'test-host1',
+                'daemon_type': 'osd',
+                'daemon_id': '1'
+            }
+        ]
+    }
+
+    _session = None  # type: requests.sessions.Session
+    _token = None
+    _resp = None  # type: requests.models.Response
+    _loggedin = False
+    _base_uri = None
+
+    AUTO_AUTHENTICATE = True
+
+    AUTH_ROLES = ['administrator']
+
+    @classmethod
+    def create_user(cls, username, password, roles=None,
+                    force_password=True, cmd_args=None):
+        # pylint: disable=too-many-arguments
+        """
+        :param username: The name of the user.
+        :type username: str
+        :param password: The password.
+        :type password: str
+        :param roles: A list of roles.
+        :type roles: list
+        :param force_password: Force the use of the specified password. This
+          will bypass the password complexity check. Defaults to 'True'.
+        :type force_password: bool
+        :param cmd_args: Additional command line arguments for the
+          'ac-user-create' command.
+        :type cmd_args: None | list[str]
+        """
+        try:
+            cls._ceph_cmd(['dashboard', 'ac-user-show', username])
+            cls._ceph_cmd(['dashboard', 'ac-user-delete', username])
+        except CommandFailedError as ex:
+            if ex.exitstatus != 2:
+                raise ex
+
+        user_create_args = [
+            'dashboard', 'ac-user-create', username
+        ]
+        if force_password:
+            user_create_args.append('--force-password')
+        if cmd_args:
+            user_create_args.extend(cmd_args)
+        cls._ceph_cmd_with_secret(user_create_args, password)
+        if roles:
+            set_roles_args = ['dashboard', 'ac-user-set-roles', username]
+            for idx, role in enumerate(roles):
+                if isinstance(role, str):
+                    set_roles_args.append(role)
+                else:
+                    assert isinstance(role, dict)
+                    rolename = 'test_role_{}'.format(idx)
+                    try:
+                        cls._ceph_cmd(['dashboard', 'ac-role-show', rolename])
+                        cls._ceph_cmd(['dashboard', 'ac-role-delete', rolename])
+                    except CommandFailedError as ex:
+                        if ex.exitstatus != 2:
+                            raise ex
+                    cls._ceph_cmd(['dashboard', 'ac-role-create', rolename])
+                    for mod, perms in role.items():
+                        args = ['dashboard', 'ac-role-add-scope-perms', rolename, mod]
+                        args.extend(perms)
+                        cls._ceph_cmd(args)
+                    set_roles_args.append(rolename)
+            cls._ceph_cmd(set_roles_args)
+
+    @classmethod
+    def create_pool(cls, name, pg_num, pool_type, application='rbd'):
+        data = {
+            'pool': name,
+            'pg_num': pg_num,
+            'pool_type': pool_type,
+            'application_metadata': [application]
+        }
+        if pool_type == 'erasure':
+            data['flags'] = ['ec_overwrites']
+        cls._task_post("/api/pool", data)
+
+    @classmethod
+    def login(cls, username, password, set_cookies=False):
+        if cls._loggedin:
+            cls.logout()
+        cls._post('/api/auth', {'username': username,
+                                'password': password}, set_cookies=set_cookies)
+        cls._assertEq(cls._resp.status_code, 201)
+        cls._token = cls.jsonBody()['token']
+        cls._loggedin = True
+
+    @classmethod
+    def logout(cls, set_cookies=False):
+        if cls._loggedin:
+            cls._post('/api/auth/logout', set_cookies=set_cookies)
+            cls._assertEq(cls._resp.status_code, 200)
+            cls._token = None
+            cls._loggedin = False
+
+    @classmethod
+    def delete_user(cls, username, roles=None):
+        if roles is None:
+            roles = []
+        cls._ceph_cmd(['dashboard', 'ac-user-delete', username])
+        for idx, role in enumerate(roles):
+            if isinstance(role, dict):
+                cls._ceph_cmd(['dashboard', 'ac-role-delete', 'test_role_{}'.format(idx)])
+
+    @classmethod
+    def RunAs(cls, username, password, roles=None, force_password=True,
+              cmd_args=None, login=True):
+        # pylint: disable=too-many-arguments
+        def wrapper(func):
+            def execute(self, *args, **kwargs):
+                self.create_user(username, password, roles,
+                                 force_password, cmd_args)
+                if login:
+                    self.login(username, password)
+                res = func(self, *args, **kwargs)
+                if login:
+                    self.logout()
+                self.delete_user(username, roles)
+                return res
+
+            return execute
+
+        return wrapper
+
+    @classmethod
+    def set_jwt_token(cls, token):
+        cls._token = token
+
+    @classmethod
+    def setUpClass(cls):
+        super(DashboardTestCase, cls).setUpClass()
+        cls._assign_ports("dashboard", "ssl_server_port")
+        cls._load_module("dashboard")
+        cls.update_base_uri()
+
+        if cls.CEPHFS:
+            cls.mds_cluster.clear_firewall()
+
+            # To avoid any issues with e.g. unlink bugs, we destroy and recreate
+            # the filesystem rather than just doing a rm -rf of files
+            cls.mds_cluster.mds_stop()
+            cls.mds_cluster.mds_fail()
+            cls.mds_cluster.delete_all_filesystems()
+            cls.fs = None  # is now invalid!
+
+            cls.fs = cls.mds_cluster.newfs(create=True)
+            cls.fs.mds_restart()
+
+            # In case some test messed with auth caps, reset them
+            # pylint: disable=not-an-iterable
+            client_mount_ids = [m.client_id for m in cls.mounts]
+            for client_id in client_mount_ids:
+                cls.mds_cluster.mon_manager.raw_cluster_cmd_result(
+                    'auth', 'caps', "client.{0}".format(client_id),
+                    'mds', 'allow',
+                    'mon', 'allow r',
+                    'osd', 'allow rw pool={0}'.format(cls.fs.get_data_pool_name()))
+
+            # wait for mds restart to complete...
+            cls.fs.wait_for_daemons()
+
+        if cls.ORCHESTRATOR:
+            cls._load_module("test_orchestrator")
+
+            cmd = ['orch', 'set', 'backend', 'test_orchestrator']
+            cls.mgr_cluster.mon_manager.raw_cluster_cmd(*cmd)
+
+            cmd = ['test_orchestrator', 'load_data', '-i', '-']
+            cls.mgr_cluster.mon_manager.raw_cluster_cmd_result(*cmd, stdin=json.dumps(
+                cls.ORCHESTRATOR_TEST_DATA
+            ))
+
+        cls._token = None
+        cls._session = requests.Session()
+        cls._resp = None
+
+        cls.create_user('admin', 'admin', cls.AUTH_ROLES)
+        if cls.AUTO_AUTHENTICATE:
+            cls.login('admin', 'admin')
+
+    @classmethod
+    def update_base_uri(cls):
+        if cls._base_uri is None:
+            cls._base_uri = cls._get_uri("dashboard").rstrip('/')
+
+    def setUp(self):
+        super(DashboardTestCase, self).setUp()
+        if not self._loggedin and self.AUTO_AUTHENTICATE:
+            self.login('admin', 'admin')
+        self.wait_for_health_clear(self.TIMEOUT_HEALTH_CLEAR)
+
+    @classmethod
+    def tearDownClass(cls):
+        super(DashboardTestCase, cls).tearDownClass()
+
+    # pylint: disable=inconsistent-return-statements, too-many-arguments, too-many-branches
+    @classmethod
+    def _request(cls, url, method, data=None, params=None, version=DEFAULT_API_VERSION,
+                 set_cookies=False):
+        url = "{}{}".format(cls._base_uri, url)
+        log.debug("Request %s to %s", method, url)
+        headers = {}
+        cookies = {}
+        if cls._token:
+            if set_cookies:
+                cookies['token'] = cls._token
+            else:
+                headers['Authorization'] = "Bearer {}".format(cls._token)
+        if version is None:
+            headers['Accept'] = 'application/json'
+        else:
+            headers['Accept'] = 'application/vnd.ceph.api.v{}+json'.format(version)
+
+        if set_cookies:
+            if method == 'GET':
+                cls._resp = cls._session.get(url, params=params, verify=False,
+                                             headers=headers, cookies=cookies)
+            elif method == 'POST':
+                cls._resp = cls._session.post(url, json=data, params=params,
+                                              verify=False, headers=headers, cookies=cookies)
+            elif method == 'DELETE':
+                cls._resp = cls._session.delete(url, json=data, params=params,
+                                                verify=False, headers=headers, cookies=cookies)
+            elif method == 'PUT':
+                cls._resp = cls._session.put(url, json=data, params=params,
+                                             verify=False, headers=headers, cookies=cookies)
+            else:
+                assert False
+        else:
+            if method == 'GET':
+                cls._resp = cls._session.get(url, params=params, verify=False,
+                                             headers=headers)
+            elif method == 'POST':
+                cls._resp = cls._session.post(url, json=data, params=params,
+                                              verify=False, headers=headers)
+            elif method == 'DELETE':
+                cls._resp = cls._session.delete(url, json=data, params=params,
+                                                verify=False, headers=headers)
+            elif method == 'PUT':
+                cls._resp = cls._session.put(url, json=data, params=params,
+                                             verify=False, headers=headers)
+            else:
+                assert False
+        try:
+            if not cls._resp.ok:
+                # Output response for easier debugging.
+                log.error("Request response: %s", cls._resp.text)
+            content_type = cls._resp.headers['content-type']
+            if re.match(r'^application/.*json',
+                        content_type) and cls._resp.text and cls._resp.text != "":
+                return cls._resp.json()
+            return cls._resp.text
+        except ValueError as ex:
+            log.exception("Failed to decode response: %s", cls._resp.text)
+            raise ex
+
+    @classmethod
+    def _get(cls, url, params=None, version=DEFAULT_API_VERSION, set_cookies=False):
+        return cls._request(url, 'GET', params=params, version=version, set_cookies=set_cookies)
+
+    @classmethod
+    def _view_cache_get(cls, url, retries=5):
+        retry = True
+        while retry and retries > 0:
+            retry = False
+            res = cls._get(url, version=DEFAULT_API_VERSION)
+            if isinstance(res, dict):
+                res = [res]
+            for view in res:
+                assert 'value' in view
+                if not view['value']:
+                    retry = True
+            retries -= 1
+        if retries == 0:
+            raise Exception("{} view cache exceeded number of retries={}"
+                            .format(url, retries))
+        return res
+
+    @classmethod
+    def _post(cls, url, data=None, params=None, version=DEFAULT_API_VERSION, set_cookies=False):
+        cls._request(url, 'POST', data, params, version=version, set_cookies=set_cookies)
+
+    @classmethod
+    def _delete(cls, url, data=None, params=None, version=DEFAULT_API_VERSION, set_cookies=False):
+        cls._request(url, 'DELETE', data, params, version=version, set_cookies=set_cookies)
+
+    @classmethod
+    def _put(cls, url, data=None, params=None, version=DEFAULT_API_VERSION, set_cookies=False):
+        cls._request(url, 'PUT', data, params, version=version, set_cookies=set_cookies)
+
+    @classmethod
+    def _assertEq(cls, v1, v2):
+        if not v1 == v2:
+            raise Exception("assertion failed: {} != {}".format(v1, v2))
+
+    @classmethod
+    def _assertIn(cls, v1, v2):
+        if v1 not in v2:
+            raise Exception("assertion failed: {} not in {}".format(v1, v2))
+
+    @classmethod
+    def _assertIsInst(cls, v1, v2):
+        if not isinstance(v1, v2):
+            raise Exception("assertion failed: {} not instance of {}".format(v1, v2))
+
+    # pylint: disable=too-many-arguments
+    @classmethod
+    def _task_request(cls, method, url, data, timeout, version=DEFAULT_API_VERSION,
+                      set_cookies=False):
+        res = cls._request(url, method, data, version=version, set_cookies=set_cookies)
+        cls._assertIn(cls._resp.status_code, [200, 201, 202, 204, 400, 403, 404])
+
+        if cls._resp.status_code == 403:
+            return None
+
+        if cls._resp.status_code != 202:
+            log.debug("task finished immediately")
+            return res
+
+        cls._assertIn('name', res)
+        cls._assertIn('metadata', res)
+        task_name = res['name']
+        task_metadata = res['metadata']
+
+        retries = int(timeout)
+        res_task = None
+        while retries > 0 and not res_task:
+            retries -= 1
+            log.debug("task (%s, %s) is still executing", task_name, task_metadata)
+            time.sleep(1)
+            _res = cls._get('/api/task?name={}'.format(task_name), version=version)
+            cls._assertEq(cls._resp.status_code, 200)
+            executing_tasks = [task for task in _res['executing_tasks'] if
+                               task['metadata'] == task_metadata]
+            finished_tasks = [task for task in _res['finished_tasks'] if
+                              task['metadata'] == task_metadata]
+            if not executing_tasks and finished_tasks:
+                res_task = finished_tasks[0]
+
+        if retries <= 0:
+            raise Exception("Waiting for task ({}, {}) to finish timed out. {}"
+                            .format(task_name, task_metadata, _res))
+
+        log.debug("task (%s, %s) finished", task_name, task_metadata)
+        if res_task['success']:
+            if method == 'POST':
+                cls._resp.status_code = 201
+            elif method == 'PUT':
+                cls._resp.status_code = 200
+            elif method == 'DELETE':
+                cls._resp.status_code = 204
+            return res_task['ret_value']
+
+        if 'status' in res_task['exception']:
+            cls._resp.status_code = res_task['exception']['status']
+        else:
+            cls._resp.status_code = 500
+        return res_task['exception']
+
+    @classmethod
+    def _task_post(cls, url, data=None, timeout=60, version=DEFAULT_API_VERSION, set_cookies=False):
+        return cls._task_request('POST', url, data, timeout, version=version,
+                                 set_cookies=set_cookies)
+
+    @classmethod
+    def _task_delete(cls, url, timeout=60, version=DEFAULT_API_VERSION, set_cookies=False):
+        return cls._task_request('DELETE', url, None, timeout, version=version,
+                                 set_cookies=set_cookies)
+
+    @classmethod
+    def _task_put(cls, url, data=None, timeout=60, version=DEFAULT_API_VERSION, set_cookies=False):
+        return cls._task_request('PUT', url, data, timeout, version=version,
+                                 set_cookies=set_cookies)
+
+    @classmethod
+    def cookies(cls):
+        return cls._resp.cookies
+
+    @classmethod
+    def jsonBody(cls):
+        return cls._resp.json()
+
+    @classmethod
+    def reset_session(cls):
+        cls._session = requests.Session()
+
+    def assertSubset(self, data, biggerData):
+        for key, value in data.items():
+            self.assertEqual(biggerData[key], value)
+
+    def assertJsonBody(self, data):
+        body = self._resp.json()
+        self.assertEqual(body, data)
+
+    def assertJsonSubset(self, data):
+        self.assertSubset(data, self._resp.json())
+
+    def assertSchema(self, data, schema):
+        try:
+            return _validate_json(data, schema)
+        except _ValError as e:
+            self.assertEqual(data, str(e))
+
+    def assertSchemaBody(self, schema):
+        self.assertSchema(self.jsonBody(), schema)
+
+    def assertBody(self, body):
+        self.assertEqual(self._resp.text, body)
+
+    def assertStatus(self, status):
+        if isinstance(status, list):
+            self.assertIn(self._resp.status_code, status)
+        else:
+            self.assertEqual(self._resp.status_code, status)
+
+    def assertHeaders(self, headers):
+        for name, value in headers.items():
+            self.assertIn(name, self._resp.headers)
+            self.assertEqual(self._resp.headers[name], value)
+
+    def assertError(self, code=None, component=None, detail=None):
+        body = self._resp.json()
+        if code:
+            self.assertEqual(body['code'], code)
+        if component:
+            self.assertEqual(body['component'], component)
+        if detail:
+            self.assertEqual(body['detail'], detail)
+
+    @classmethod
+    def _ceph_cmd(cls, cmd):
+        res = cls.mgr_cluster.mon_manager.raw_cluster_cmd(*cmd)
+        log.debug("command result: %s", res)
+        return res
+
+    @classmethod
+    def _ceph_cmd_result(cls, cmd):
+        exitstatus = cls.mgr_cluster.mon_manager.raw_cluster_cmd_result(*cmd)
+        log.debug("command exit status: %d", exitstatus)
+        return exitstatus
+
+    @classmethod
+    def _ceph_cmd_with_secret(cls, cmd: List[str], secret: str, return_exit_code: bool = False):
+        cmd.append('-i')
+        cmd.append('{}'.format(cls._ceph_create_tmp_file(secret)))
+        if return_exit_code:
+            return cls._ceph_cmd_result(cmd)
+        return cls._ceph_cmd(cmd)
+
+    @classmethod
+    def _ceph_create_tmp_file(cls, content: str) -> str:
+        """Create a temporary file in the remote cluster"""
+        file_name = ''.join(random.choices(string.ascii_letters + string.digits, k=20))
+        file_path = '/tmp/{}'.format(file_name)
+        cls._cmd(['sh', '-c', 'echo -n {} > {}'.format(content, file_path)])
+        return file_path
+
+    def set_config_key(self, key, value):
+        self._ceph_cmd(['config-key', 'set', key, value])
+
+    def get_config_key(self, key):
+        return self._ceph_cmd(['config-key', 'get', key])
+
+    @classmethod
+    def _cmd(cls, args):
+        return cls.mgr_cluster.admin_remote.run(args=args)
+
+    @classmethod
+    def _rbd_cmd(cls, cmd):
+        args = ['rbd']
+        args.extend(cmd)
+        cls._cmd(args)
+
+    @classmethod
+    def _radosgw_admin_cmd(cls, cmd):
+        args = ['radosgw-admin']
+        args.extend(cmd)
+        cls._cmd(args)
+
+    @classmethod
+    def _rados_cmd(cls, cmd):
+        args = ['rados']
+        args.extend(cmd)
+        cls._cmd(args)
+
+    @classmethod
+    def mons(cls):
+        out = cls.ceph_cluster.mon_manager.raw_cluster_cmd('quorum_status')
+        j = json.loads(out)
+        return [mon['name'] for mon in j['monmap']['mons']]
+
+    @classmethod
+    def find_object_in_list(cls, key, value, iterable):
+        """
+        Get the first occurrence of an object within a list with
+        the specified key/value.
+        :param key: The name of the key.
+        :param value: The value to search for.
+        :param iterable: The list to process.
+        :return: Returns the found object or None.
+        """
+        for obj in iterable:
+            if key in obj and obj[key] == value:
+                return obj
+        return None
+
+
+# TODP: pass defaults=(False,) to namedtuple() if python3.7
+class JLeaf(namedtuple('JLeaf', ['typ', 'none'])):
+    def __new__(cls, typ, none=False):
+        return super().__new__(cls, typ, none)
+
+
+JList = namedtuple('JList', ['elem_typ'])
+
+JTuple = namedtuple('JList', ['elem_typs'])
+
+JUnion = namedtuple('JUnion', ['elem_typs'])
+
+
+class JObj(namedtuple('JObj', ['sub_elems', 'allow_unknown', 'none', 'unknown_schema'])):
+    def __new__(cls, sub_elems, allow_unknown=False, none=False, unknown_schema=None):
+        """
+        :type sub_elems: dict[str, JAny | JLeaf | JList | JObj | type]
+        :type allow_unknown: bool
+        :type none: bool
+        :type unknown_schema: int, str, JAny | JLeaf | JList | JObj
+        :return:
+        """
+        return super(JObj, cls).__new__(cls, sub_elems, allow_unknown, none, unknown_schema)
+
+
+JAny = namedtuple('JAny', ['none'])
+
+module_options_object_schema = JObj({
+    'name': str,
+    'type': str,
+    'level': str,
+    'flags': int,
+    'default_value': JAny(none=True),
+    'min': JAny(none=False),
+    'max': JAny(none=False),
+    'enum_allowed': JList(str),
+    'see_also': JList(str),
+    'desc': str,
+    'long_desc': str,
+    'tags': JList(str),
+})
+
+module_options_schema = JObj(
+    {},
+    allow_unknown=True,
+    unknown_schema=module_options_object_schema)
+
+addrvec_schema = JList(JObj({
+    'addr': str,
+    'nonce': int,
+    'type': str
+}))
+
+devices_schema = JList(JObj({
+    'daemons': JList(str),
+    'devid': str,
+    'location': JList(JObj({
+        'host': str,
+        'dev': str,
+        'path': str
+    }))
+}, allow_unknown=True))
+
+
+class _ValError(Exception):
+    def __init__(self, msg, path):
+        path_str = ''.join('[{}]'.format(repr(p)) for p in path)
+        super(_ValError, self).__init__('In `input{}`: {}'.format(path_str, msg))
+
+
+# pylint: disable=dangerous-default-value,inconsistent-return-statements,too-many-branches
+def _validate_json(val, schema, path=[]):
+    """
+    >>> d = {'a': 1, 'b': 'x', 'c': range(10)}
+    ... ds = JObj({'a': int, 'b': str, 'c': JList(int)})
+    ... _validate_json(d, ds)
+    True
+    >>> _validate_json({'num': 1}, JObj({'num': JUnion([int,float])}))
+    True
+    >>> _validate_json({'num': 'a'}, JObj({'num': JUnion([int,float])}))
+    False
+    """
+    if isinstance(schema, JAny):
+        if not schema.none and val is None:
+            raise _ValError('val is None', path)
+        return True
+    if isinstance(schema, JLeaf):
+        if schema.none and val is None:
+            return True
+        if not isinstance(val, schema.typ):
+            raise _ValError('val not of type {}'.format(schema.typ), path)
+        return True
+    if isinstance(schema, JList):
+        if not isinstance(val, list):
+            raise _ValError('val="{}" is not a list'.format(val), path)
+        return all(_validate_json(e, schema.elem_typ, path + [i]) for i, e in enumerate(val))
+    if isinstance(schema, JTuple):
+        return all(_validate_json(val[i], typ, path + [i])
+                   for i, typ in enumerate(schema.elem_typs))
+    if isinstance(schema, JUnion):
+        for typ in schema.elem_typs:
+            try:
+                if _validate_json(val, typ, path):
+                    return True
+            except _ValError:
+                pass
+        return False
+    if isinstance(schema, JObj):
+        if val is None and schema.none:
+            return True
+        if val is None:
+            raise _ValError('val is None', path)
+        if not hasattr(val, 'keys'):
+            raise _ValError('val="{}" is not a dict'.format(val), path)
+        missing_keys = set(schema.sub_elems.keys()).difference(set(val.keys()))
+        if missing_keys:
+            raise _ValError('missing keys: {}'.format(missing_keys), path)
+        unknown_keys = set(val.keys()).difference(set(schema.sub_elems.keys()))
+        if not schema.allow_unknown and unknown_keys:
+            raise _ValError('unknown keys: {}'.format(unknown_keys), path)
+        result = all(
+            _validate_json(val[key], sub_schema, path + [key])
+            for key, sub_schema in schema.sub_elems.items()
+        )
+        if unknown_keys and schema.allow_unknown and schema.unknown_schema:
+            result += all(
+                _validate_json(val[key], schema.unknown_schema, path + [key])
+                for key in unknown_keys
+            )
+        return result
+    if schema in [str, int, float, bool]:
+        return _validate_json(val, JLeaf(schema), path)
+
+    assert False, str(path)
diff --git a/qa/tasks/mgr/dashboard/test_api.py b/qa/tasks/mgr/dashboard/test_api.py
new file mode 100644
index 000000000..22f235698
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_api.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+import unittest
+
+from . import DEFAULT_API_VERSION
+from .helper import DashboardTestCase
+
+
+class VersionReqTest(DashboardTestCase, unittest.TestCase):
+    def test_version(self):
+        for (version, expected_status) in [
+                (DEFAULT_API_VERSION, 200),
+                (None, 415),
+                ("99.99", 415)
+        ]:
+            with self.subTest(version=version):
+                self._get('/api/summary', version=version)
+                self.assertStatus(expected_status)
diff --git a/qa/tasks/mgr/dashboard/test_auth.py b/qa/tasks/mgr/dashboard/test_auth.py
new file mode 100644
index 000000000..a2266229b
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_auth.py
@@ -0,0 +1,352 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+import time
+
+import jwt
+from teuthology.orchestra.run import \
+    CommandFailedError  # pylint: disable=import-error
+
+from .helper import DashboardTestCase, JLeaf, JObj
+
+
+class AuthTest(DashboardTestCase):
+
+    AUTO_AUTHENTICATE = False
+
+    def setUp(self):
+        super(AuthTest, self).setUp()
+        self.reset_session()
+
+    def _validate_jwt_token(self, token, username, permissions):
+        payload = jwt.decode(token, options={'verify_signature': False})
+        self.assertIn('username', payload)
+        self.assertEqual(payload['username'], username)
+
+        for scope, perms in permissions.items():
+            self.assertIsNotNone(scope)
+            self.assertIn('read', perms)
+            self.assertIn('update', perms)
+            self.assertIn('create', perms)
+            self.assertIn('delete', perms)
+
+    def test_login_without_password(self):
+        with self.assertRaises(CommandFailedError):
+            self.create_user('admin2', '', ['administrator'], force_password=True)
+
+    def test_a_set_login_credentials(self):
+        # test with Authorization header
+        self.create_user('admin2', 'admin2', ['administrator'])
+        self._post("/api/auth", {'username': 'admin2', 'password': 'admin2'})
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self._validate_jwt_token(data['token'], "admin2", data['permissions'])
+        self.delete_user('admin2')
+
+        # test with Cookies set
+        self.create_user('admin2', 'admin2', ['administrator'])
+        self._post("/api/auth", {'username': 'admin2', 'password': 'admin2'}, set_cookies=True)
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self._validate_jwt_token(data['token'], "admin2", data['permissions'])
+        self.delete_user('admin2')
+
+    def test_login_valid(self):
+        # test with Authorization header
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'})
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            'token': JLeaf(str),
+            'username': JLeaf(str),
+            'permissions': JObj(sub_elems={}, allow_unknown=True),
+            'sso': JLeaf(bool),
+            'pwdExpirationDate': JLeaf(int, none=True),
+            'pwdUpdateRequired': JLeaf(bool)
+        }, allow_unknown=False))
+        self._validate_jwt_token(data['token'], "admin", data['permissions'])
+
+        # test with Cookies set
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'}, set_cookies=True)
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            'token': JLeaf(str),
+            'username': JLeaf(str),
+            'permissions': JObj(sub_elems={}, allow_unknown=True),
+            'sso': JLeaf(bool),
+            'pwdExpirationDate': JLeaf(int, none=True),
+            'pwdUpdateRequired': JLeaf(bool)
+        }, allow_unknown=False))
+        self._validate_jwt_token(data['token'], "admin", data['permissions'])
+
+    def test_login_invalid(self):
+        # test with Authorization header
+        self._post("/api/auth", {'username': 'admin', 'password': 'inval'})
+        self.assertStatus(400)
+        self.assertJsonBody({
+            "component": "auth",
+            "code": "invalid_credentials",
+            "detail": "Invalid credentials"
+        })
+
+    def test_lockout_user(self):
+        # test with Authorization header
+        self._ceph_cmd(['dashboard', 'set-account-lockout-attempts', '3'])
+        for _ in range(3):
+            self._post("/api/auth", {'username': 'admin', 'password': 'inval'})
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'})
+        self.assertStatus(400)
+        self.assertJsonBody({
+            "component": "auth",
+            "code": "invalid_credentials",
+            "detail": "Invalid credentials"
+        })
+        self._ceph_cmd(['dashboard', 'ac-user-enable', 'admin'])
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'})
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            'token': JLeaf(str),
+            'username': JLeaf(str),
+            'permissions': JObj(sub_elems={}, allow_unknown=True),
+            'sso': JLeaf(bool),
+            'pwdExpirationDate': JLeaf(int, none=True),
+            'pwdUpdateRequired': JLeaf(bool)
+        }, allow_unknown=False))
+        self._validate_jwt_token(data['token'], "admin", data['permissions'])
+
+        # test with Cookies set
+        self._ceph_cmd(['dashboard', 'set-account-lockout-attempts', '3'])
+        for _ in range(3):
+            self._post("/api/auth", {'username': 'admin', 'password': 'inval'}, set_cookies=True)
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'}, set_cookies=True)
+        self.assertStatus(400)
+        self.assertJsonBody({
+            "component": "auth",
+            "code": "invalid_credentials",
+            "detail": "Invalid credentials"
+        })
+        self._ceph_cmd(['dashboard', 'ac-user-enable', 'admin'])
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'}, set_cookies=True)
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            'token': JLeaf(str),
+            'username': JLeaf(str),
+            'permissions': JObj(sub_elems={}, allow_unknown=True),
+            'sso': JLeaf(bool),
+            'pwdExpirationDate': JLeaf(int, none=True),
+            'pwdUpdateRequired': JLeaf(bool)
+        }, allow_unknown=False))
+        self._validate_jwt_token(data['token'], "admin", data['permissions'])
+
+    def test_logout(self):
+        # test with Authorization header
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'})
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self._validate_jwt_token(data['token'], "admin", data['permissions'])
+        self.set_jwt_token(data['token'])
+        self._post("/api/auth/logout")
+        self.assertStatus(200)
+        self.assertJsonBody({
+            "redirect_url": "#/login"
+        })
+        self._get("/api/host", version='1.1')
+        self.assertStatus(401)
+        self.set_jwt_token(None)
+
+        # test with Cookies set
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'}, set_cookies=True)
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self._validate_jwt_token(data['token'], "admin", data['permissions'])
+        self.set_jwt_token(data['token'])
+        self._post("/api/auth/logout", set_cookies=True)
+        self.assertStatus(200)
+        self.assertJsonBody({
+            "redirect_url": "#/login"
+        })
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(401)
+        self.set_jwt_token(None)
+
+    def test_token_ttl(self):
+        # test with Authorization header
+        self._ceph_cmd(['dashboard', 'set-jwt-token-ttl', '5'])
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'})
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        self._get("/api/host", version='1.1')
+        self.assertStatus(200)
+        time.sleep(6)
+        self._get("/api/host", version='1.1')
+        self.assertStatus(401)
+        self._ceph_cmd(['dashboard', 'set-jwt-token-ttl', '28800'])
+        self.set_jwt_token(None)
+
+        # test with Cookies set
+        self._ceph_cmd(['dashboard', 'set-jwt-token-ttl', '5'])
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'}, set_cookies=True)
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(200)
+        time.sleep(6)
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(401)
+        self._ceph_cmd(['dashboard', 'set-jwt-token-ttl', '28800'])
+        self.set_jwt_token(None)
+
+    def test_remove_from_blocklist(self):
+        # test with Authorization header
+        self._ceph_cmd(['dashboard', 'set-jwt-token-ttl', '5'])
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'})
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        # the following call adds the token to the blocklist
+        self._post("/api/auth/logout")
+        self.assertStatus(200)
+        self._get("/api/host", version='1.1')
+        self.assertStatus(401)
+        time.sleep(6)
+        self._ceph_cmd(['dashboard', 'set-jwt-token-ttl', '28800'])
+        self.set_jwt_token(None)
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'})
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        # the following call removes expired tokens from the blocklist
+        self._post("/api/auth/logout")
+        self.assertStatus(200)
+
+        # test with Cookies set
+        self._ceph_cmd(['dashboard', 'set-jwt-token-ttl', '5'])
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'}, set_cookies=True)
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        # the following call adds the token to the blocklist
+        self._post("/api/auth/logout", set_cookies=True)
+        self.assertStatus(200)
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(401)
+        time.sleep(6)
+        self._ceph_cmd(['dashboard', 'set-jwt-token-ttl', '28800'])
+        self.set_jwt_token(None)
+        self._post("/api/auth", {'username': 'admin', 'password': 'admin'}, set_cookies=True)
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        # the following call removes expired tokens from the blocklist
+        self._post("/api/auth/logout", set_cookies=True)
+        self.assertStatus(200)
+
+    def test_unauthorized(self):
+        # test with Authorization header
+        self._get("/api/host", version='1.1')
+        self.assertStatus(401)
+
+        # test with Cookies set
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(401)
+
+    def test_invalidate_token_by_admin(self):
+        # test with Authorization header
+        self._get("/api/host", version='1.1')
+        self.assertStatus(401)
+        self.create_user('user', 'user', ['read-only'])
+        time.sleep(1)
+        self._post("/api/auth", {'username': 'user', 'password': 'user'})
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        self._get("/api/host", version='1.1')
+        self.assertStatus(200)
+        time.sleep(1)
+        self._ceph_cmd_with_secret(['dashboard', 'ac-user-set-password', '--force-password',
+                                    'user'],
+                                   'user2')
+        time.sleep(1)
+        self._get("/api/host", version='1.1')
+        self.assertStatus(401)
+        self.set_jwt_token(None)
+        self._post("/api/auth", {'username': 'user', 'password': 'user2'})
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        self._get("/api/host", version='1.1')
+        self.assertStatus(200)
+        self.delete_user("user")
+
+        # test with Cookies set
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(401)
+        self.create_user('user', 'user', ['read-only'])
+        time.sleep(1)
+        self._post("/api/auth", {'username': 'user', 'password': 'user'}, set_cookies=True)
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(200)
+        time.sleep(1)
+        self._ceph_cmd_with_secret(['dashboard', 'ac-user-set-password', '--force-password',
+                                    'user'],
+                                   'user2')
+        time.sleep(1)
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(401)
+        self.set_jwt_token(None)
+        self._post("/api/auth", {'username': 'user', 'password': 'user2'}, set_cookies=True)
+        self.assertStatus(201)
+        self.set_jwt_token(self.jsonBody()['token'])
+        self._get("/api/host", set_cookies=True, version='1.1')
+        self.assertStatus(200)
+        self.delete_user("user")
+
+    def test_check_token(self):
+        # test with Authorization header
+        self.login("admin", "admin")
+        self._post("/api/auth/check", {"token": self.jsonBody()["token"]})
+        self.assertStatus(200)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            "username": JLeaf(str),
+            "permissions": JObj(sub_elems={}, allow_unknown=True),
+            "sso": JLeaf(bool),
+            "pwdUpdateRequired": JLeaf(bool)
+        }, allow_unknown=False))
+        self.logout()
+
+        # test with Cookies set
+        self.login("admin", "admin", set_cookies=True)
+        self._post("/api/auth/check", {"token": self.jsonBody()["token"]}, set_cookies=True)
+        self.assertStatus(200)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            "username": JLeaf(str),
+            "permissions": JObj(sub_elems={}, allow_unknown=True),
+            "sso": JLeaf(bool),
+            "pwdUpdateRequired": JLeaf(bool)
+        }, allow_unknown=False))
+        self.logout(set_cookies=True)
+
+    def test_check_wo_token(self):
+        # test with Authorization header
+        self.login("admin", "admin")
+        self._post("/api/auth/check", {"token": ""})
+        self.assertStatus(200)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            "login_url": JLeaf(str),
+            "cluster_status": JLeaf(str)
+        }, allow_unknown=False))
+        self.logout()
+
+        # test with Cookies set
+        self.login("admin", "admin", set_cookies=True)
+        self._post("/api/auth/check", {"token": ""}, set_cookies=True)
+        self.assertStatus(200)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            "login_url": JLeaf(str),
+            "cluster_status": JLeaf(str)
+        }, allow_unknown=False))
+        self.logout(set_cookies=True)
diff --git a/qa/tasks/mgr/dashboard/test_cephfs.py b/qa/tasks/mgr/dashboard/test_cephfs.py
new file mode 100644
index 000000000..4295b580f
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_cephfs.py
@@ -0,0 +1,292 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=too-many-public-methods
+
+from contextlib import contextmanager
+
+from .helper import DashboardTestCase, JLeaf, JList, JObj
+
+
+class CephfsTest(DashboardTestCase):
+    CEPHFS = True
+
+    AUTH_ROLES = ['cephfs-manager']
+
+    QUOTA_PATH = '/quotas'
+
+    def assertToHave(self, data, key):
+        self.assertIn(key, data)
+        self.assertIsNotNone(data[key])
+
+    def get_fs_id(self):
+        return self.fs.get_namespace_id()
+
+    def mk_dirs(self, path, expectedStatus=200):
+        self._post("/api/cephfs/{}/tree".format(self.get_fs_id()),
+                   params={'path': path})
+        self.assertStatus(expectedStatus)
+
+    def rm_dir(self, path, expectedStatus=200):
+        self._delete("/api/cephfs/{}/tree".format(self.get_fs_id()),
+                     params={'path': path})
+        self.assertStatus(expectedStatus)
+
+    def get_root_directory(self, expectedStatus=200):
+        data = self._get("/api/cephfs/{}/get_root_directory".format(self.get_fs_id()))
+        self.assertStatus(expectedStatus)
+        self.assertIsInstance(data, dict)
+        return data
+
+    def ls_dir(self, path, expectedLength, depth=None):
+        return self._ls_dir(path, expectedLength, depth, "api")
+
+    def ui_ls_dir(self, path, expectedLength, depth=None):
+        return self._ls_dir(path, expectedLength, depth, "ui-api")
+
+    def _ls_dir(self, path, expectedLength, depth, baseApiPath):
+        params = {'path': path}
+        if depth is not None:
+            params['depth'] = depth
+        data = self._get("/{}/cephfs/{}/ls_dir".format(baseApiPath, self.get_fs_id()),
+                         params=params)
+        self.assertStatus(200)
+        self.assertIsInstance(data, list)
+        self.assertEqual(len(data), expectedLength)
+        return data
+
+    def set_quotas(self, max_bytes=None, max_files=None):
+        quotas = {
+            'max_bytes': max_bytes,
+            'max_files': max_files
+        }
+        self._put("/api/cephfs/{}/quota".format(self.get_fs_id()), data=quotas,
+                  params={'path': self.QUOTA_PATH})
+        self.assertStatus(200)
+
+    def assert_quotas(self, max_bytes, files):
+        data = self.ls_dir('/', 1)[0]
+        self.assertEqual(data['quotas']['max_bytes'], max_bytes)
+        self.assertEqual(data['quotas']['max_files'], files)
+
+    @contextmanager
+    def new_quota_dir(self):
+        self.mk_dirs(self.QUOTA_PATH)
+        self.set_quotas(1024 ** 3, 1024)
+        yield 1
+        self.rm_dir(self.QUOTA_PATH)
+
+    @DashboardTestCase.RunAs('test', 'test', ['block-manager'])
+    def test_access_permissions(self):
+        fs_id = self.get_fs_id()
+        self._get("/api/cephfs/{}/clients".format(fs_id))
+        self.assertStatus(403)
+        self._get("/api/cephfs/{}".format(fs_id))
+        self.assertStatus(403)
+        self._get("/api/cephfs/{}/mds_counters".format(fs_id))
+        self.assertStatus(403)
+        self._get("/ui-api/cephfs/{}/tabs".format(fs_id))
+        self.assertStatus(403)
+
+    def test_cephfs_clients(self):
+        fs_id = self.get_fs_id()
+        data = self._get("/api/cephfs/{}/clients".format(fs_id))
+        self.assertStatus(200)
+
+        self.assertIn('status', data)
+        self.assertIn('data', data)
+
+    def test_cephfs_evict_client_does_not_exist(self):
+        fs_id = self.get_fs_id()
+        self._delete("/api/cephfs/{}/client/1234".format(fs_id))
+        self.assertStatus(404)
+
+    def test_cephfs_evict_invalid_client_id(self):
+        fs_id = self.get_fs_id()
+        self._delete("/api/cephfs/{}/client/xyz".format(fs_id))
+        self.assertStatus(400)
+        self.assertJsonBody({
+            "component": 'cephfs',
+            "code": "invalid_cephfs_client_id",
+            "detail": "Invalid cephfs client ID xyz"
+        })
+
+    def test_cephfs_get(self):
+        fs_id = self.get_fs_id()
+        data = self._get("/api/cephfs/{}/".format(fs_id))
+        self.assertStatus(200)
+
+        self.assertToHave(data, 'cephfs')
+        self.assertToHave(data, 'standbys')
+        self.assertToHave(data, 'versions')
+
+    def test_cephfs_mds_counters(self):
+        fs_id = self.get_fs_id()
+        data = self._get("/api/cephfs/{}/mds_counters".format(fs_id))
+        self.assertStatus(200)
+
+        self.assertIsInstance(data, dict)
+        self.assertIsNotNone(data)
+
+    def test_cephfs_mds_counters_wrong(self):
+        self._get("/api/cephfs/baadbaad/mds_counters")
+        self.assertStatus(400)
+        self.assertJsonBody({
+            "component": 'cephfs',
+            "code": "invalid_cephfs_id",
+            "detail": "Invalid cephfs ID baadbaad"
+        })
+
+    def test_cephfs_list(self):
+        data = self._get("/api/cephfs/")
+        self.assertStatus(200)
+
+        self.assertIsInstance(data, list)
+        cephfs = data[0]
+        self.assertToHave(cephfs, 'id')
+        self.assertToHave(cephfs, 'mdsmap')
+
+    def test_cephfs_get_quotas(self):
+        fs_id = self.get_fs_id()
+        data = self._get("/api/cephfs/{}/quota?path=/".format(fs_id))
+        self.assertStatus(200)
+        self.assertSchema(data, JObj({
+            'max_bytes': int,
+            'max_files': int
+        }))
+
+    def test_cephfs_tabs(self):
+        fs_id = self.get_fs_id()
+        data = self._get("/ui-api/cephfs/{}/tabs".format(fs_id))
+        self.assertStatus(200)
+        self.assertIsInstance(data, dict)
+
+        # Pools
+        pools = data['pools']
+        self.assertIsInstance(pools, list)
+        self.assertGreater(len(pools), 0)
+        for pool in pools:
+            self.assertEqual(pool['size'], pool['used'] + pool['avail'])
+
+        # Ranks
+        self.assertToHave(data, 'ranks')
+        self.assertIsInstance(data['ranks'], list)
+
+        # Name
+        self.assertToHave(data, 'name')
+        self.assertIsInstance(data['name'], str)
+
+        # Standbys
+        self.assertToHave(data, 'standbys')
+        self.assertIsInstance(data['standbys'], str)
+
+        # MDS counters
+        counters = data['mds_counters']
+        self.assertIsInstance(counters, dict)
+        self.assertGreater(len(counters.keys()), 0)
+        for k, v in counters.items():
+            self.assertEqual(v['name'], k)
+
+        # Clients
+        self.assertToHave(data, 'clients')
+        clients = data['clients']
+        self.assertToHave(clients, 'data')
+        self.assertIsInstance(clients['data'], list)
+        self.assertToHave(clients, 'status')
+        self.assertIsInstance(clients['status'], int)
+
+    def test_ls_mk_rm_dir(self):
+        self.ls_dir('/', 0)
+
+        self.mk_dirs('/pictures/birds')
+        self.ls_dir('/', 2, 3)
+        self.ls_dir('/pictures', 1)
+
+        self.rm_dir('/pictures', 500)
+        self.rm_dir('/pictures/birds')
+        self.rm_dir('/pictures')
+
+        self.ls_dir('/', 0)
+
+    def test_snapshots(self):
+        fs_id = self.get_fs_id()
+        self.mk_dirs('/movies/dune/extended_version')
+
+        self._post("/api/cephfs/{}/snapshot".format(fs_id),
+                   params={'path': '/movies/dune', 'name': 'test'})
+        self.assertStatus(200)
+
+        data = self.ls_dir('/movies', 1)
+        self.assertSchema(data[0], JObj(sub_elems={
+            'name': JLeaf(str),
+            'path': JLeaf(str),
+            'parent': JLeaf(str),
+            'snapshots': JList(JObj(sub_elems={
+                'name': JLeaf(str),
+                'path': JLeaf(str),
+                'created': JLeaf(str)
+            })),
+            'quotas': JObj(sub_elems={
+                'max_bytes': JLeaf(int),
+                'max_files': JLeaf(int)
+            })
+        }))
+        snapshots = data[0]['snapshots']
+        self.assertEqual(len(snapshots), 1)
+        snapshot = snapshots[0]
+        self.assertEqual(snapshot['name'], "test")
+        self.assertEqual(snapshot['path'], "/movies/dune/.snap/test")
+
+        # Should have filtered out "_test_$timestamp"
+        data = self.ls_dir('/movies/dune', 1)
+        snapshots = data[0]['snapshots']
+        self.assertEqual(len(snapshots), 0)
+
+        self._delete("/api/cephfs/{}/snapshot".format(fs_id),
+                     params={'path': '/movies/dune', 'name': 'test'})
+        self.assertStatus(200)
+
+        data = self.ls_dir('/movies', 1)
+        self.assertEqual(len(data[0]['snapshots']), 0)
+
+        # Cleanup. Note, the CephFS Python extension (and therefor the Dashboard
+        # REST API) does not support recursive deletion of a directory.
+        self.rm_dir('/movies/dune/extended_version')
+        self.rm_dir('/movies/dune')
+        self.rm_dir('/movies')
+
+    def test_quotas_default(self):
+        self.mk_dirs(self.QUOTA_PATH)
+        self.assert_quotas(0, 0)
+        self.rm_dir(self.QUOTA_PATH)
+
+    def test_quotas_set_both(self):
+        with self.new_quota_dir():
+            self.assert_quotas(1024 ** 3, 1024)
+
+    def test_quotas_set_only_bytes(self):
+        with self.new_quota_dir():
+            self.set_quotas(2048 ** 3)
+            self.assert_quotas(2048 ** 3, 1024)
+
+    def test_quotas_set_only_files(self):
+        with self.new_quota_dir():
+            self.set_quotas(None, 2048)
+            self.assert_quotas(1024 ** 3, 2048)
+
+    def test_quotas_unset_both(self):
+        with self.new_quota_dir():
+            self.set_quotas(0, 0)
+            self.assert_quotas(0, 0)
+
+    def test_listing_of_root_dir(self):
+        self.ls_dir('/', 0)  # Should not list root
+        ui_root = self.ui_ls_dir('/', 1)[0]  # Should list root by default
+        root = self.get_root_directory()
+        self.assertEqual(ui_root, root)
+
+    def test_listing_of_ui_api_ls_on_deeper_levels(self):
+        # The UI-API and API ls_dir methods should behave the same way on deeper levels
+        self.mk_dirs('/pictures')
+        api_ls = self.ls_dir('/pictures', 0)
+        ui_api_ls = self.ui_ls_dir('/pictures', 0)
+        self.assertEqual(api_ls, ui_api_ls)
+        self.rm_dir('/pictures')
diff --git a/qa/tasks/mgr/dashboard/test_cluster.py b/qa/tasks/mgr/dashboard/test_cluster.py
new file mode 100644
index 000000000..14f854279
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_cluster.py
@@ -0,0 +1,23 @@
+from .helper import DashboardTestCase, JLeaf, JObj
+
+
+class ClusterTest(DashboardTestCase):
+
+    def setUp(self):
+        super().setUp()
+        self.reset_session()
+
+    def test_get_status(self):
+        data = self._get('/api/cluster', version='0.1')
+        self.assertStatus(200)
+        self.assertSchema(data, JObj(sub_elems={
+            "status": JLeaf(str)
+        }, allow_unknown=False))
+
+    def test_update_status(self):
+        req = {'status': 'POST_INSTALLED'}
+        self._put('/api/cluster', req, version='0.1')
+        self.assertStatus(200)
+        data = self._get('/api/cluster', version='0.1')
+        self.assertStatus(200)
+        self.assertEqual(data, req)
diff --git a/qa/tasks/mgr/dashboard/test_cluster_configuration.py b/qa/tasks/mgr/dashboard/test_cluster_configuration.py
new file mode 100644
index 000000000..9c8245d23
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_cluster_configuration.py
@@ -0,0 +1,398 @@
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase
+
+
+class ClusterConfigurationTest(DashboardTestCase):
+
+    def test_list(self):
+        data = self._get('/api/cluster_conf')
+        self.assertStatus(200)
+        self.assertIsInstance(data, list)
+        self.assertGreater(len(data), 1000)
+        for conf in data:
+            self._validate_single(conf)
+
+    def test_get(self):
+        data = self._get('/api/cluster_conf/admin_socket')
+        self.assertStatus(200)
+        self._validate_single(data)
+        self.assertIn('enum_values', data)
+
+        data = self._get('/api/cluster_conf/fantasy_name')
+        self.assertStatus(404)
+
+    def test_get_specific_db_config_option(self):
+        config_name = 'mon_allow_pool_delete'
+
+        orig_value = self._get_config_by_name(config_name)
+
+        self._ceph_cmd(['config', 'set', 'mon', config_name, 'true'])
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            [{'section': 'mon', 'value': 'true'}],
+            timeout=30,
+            period=1)
+
+        self._ceph_cmd(['config', 'set', 'mon', config_name, 'false'])
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            [{'section': 'mon', 'value': 'false'}],
+            timeout=30,
+            period=1)
+
+        # restore value
+        if orig_value:
+            self._ceph_cmd(['config', 'set', 'mon', config_name, orig_value[0]['value']])
+
+    def test_filter_config_options(self):
+        config_names = ['osd_scrub_during_recovery', 'osd_scrub_begin_hour', 'osd_scrub_end_hour']
+        data = self._get('/api/cluster_conf/filter?names={}'.format(','.join(config_names)))
+        self.assertStatus(200)
+        self.assertIsInstance(data, list)
+        self.assertEqual(len(data), 3)
+        for conf in data:
+            self._validate_single(conf)
+            self.assertIn(conf['name'], config_names)
+
+    def test_filter_config_options_empty_names(self):
+        self._get('/api/cluster_conf/filter?names=')
+        self.assertStatus(404)
+        self.assertEqual(self._resp.json()['detail'], 'Config options `` not found')
+
+    def test_filter_config_options_unknown_name(self):
+        self._get('/api/cluster_conf/filter?names=abc')
+        self.assertStatus(404)
+        self.assertEqual(self._resp.json()['detail'], 'Config options `abc` not found')
+
+    def test_filter_config_options_contains_unknown_name(self):
+        config_names = ['osd_scrub_during_recovery', 'osd_scrub_begin_hour', 'abc']
+        data = self._get('/api/cluster_conf/filter?names={}'.format(','.join(config_names)))
+        self.assertStatus(200)
+        self.assertIsInstance(data, list)
+        self.assertEqual(len(data), 2)
+        for conf in data:
+            self._validate_single(conf)
+            self.assertIn(conf['name'], config_names)
+
+    def test_create(self):
+        config_name = 'debug_ms'
+        orig_value = self._get_config_by_name(config_name)
+
+        # remove all existing settings for equal preconditions
+        self._clear_all_values_for_config_option(config_name)
+
+        expected_result = [{'section': 'mon', 'value': '0/3'}]
+
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': expected_result
+        })
+        self.assertStatus(201)
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            expected_result,
+            timeout=30,
+            period=1)
+
+        # reset original value
+        self._clear_all_values_for_config_option(config_name)
+        self._reset_original_values(config_name, orig_value)
+
+    def test_delete(self):
+        config_name = 'debug_ms'
+        orig_value = self._get_config_by_name(config_name)
+
+        # set a config option
+        expected_result = [{'section': 'mon', 'value': '0/3'}]
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': expected_result
+        })
+        self.assertStatus(201)
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            expected_result,
+            timeout=30,
+            period=1)
+
+        # delete it and check if it's deleted
+        self._delete('/api/cluster_conf/{}?section={}'.format(config_name, 'mon'))
+        self.assertStatus(204)
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            None,
+            timeout=30,
+            period=1)
+
+        # reset original value
+        self._clear_all_values_for_config_option(config_name)
+        self._reset_original_values(config_name, orig_value)
+
+    def test_create_cant_update_at_runtime(self):
+        config_name = 'public_bind_addr'  # not updatable
+        config_value = [{'section': 'global', 'value': 'true'}]
+        orig_value = self._get_config_by_name(config_name)
+
+        # try to set config option and check if it fails
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': config_value
+        })
+        self.assertStatus(400)
+        self.assertError(code='config_option_not_updatable_at_runtime',
+                         component='cluster_configuration',
+                         detail='Config option {} is/are not updatable at runtime'.format(
+                             config_name))
+
+        # check if config option value is still the original one
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            orig_value,
+            timeout=30,
+            period=1)
+
+    def test_create_two_values(self):
+        config_name = 'debug_ms'
+        orig_value = self._get_config_by_name(config_name)
+
+        # remove all existing settings for equal preconditions
+        self._clear_all_values_for_config_option(config_name)
+
+        expected_result = [{'section': 'mon', 'value': '0/3'},
+                           {'section': 'osd', 'value': '0/5'}]
+
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': expected_result
+        })
+        self.assertStatus(201)
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            expected_result,
+            timeout=30,
+            period=1)
+
+        # reset original value
+        self._clear_all_values_for_config_option(config_name)
+        self._reset_original_values(config_name, orig_value)
+
+    def test_create_can_handle_none_values(self):
+        config_name = 'debug_ms'
+        orig_value = self._get_config_by_name(config_name)
+
+        # remove all existing settings for equal preconditions
+        self._clear_all_values_for_config_option(config_name)
+
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': [{'section': 'mon', 'value': '0/3'},
+                      {'section': 'osd', 'value': None}]
+        })
+        self.assertStatus(201)
+
+        expected_result = [{'section': 'mon', 'value': '0/3'}]
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            expected_result,
+            timeout=30,
+            period=1)
+
+        # reset original value
+        self._clear_all_values_for_config_option(config_name)
+        self._reset_original_values(config_name, orig_value)
+
+    def test_create_can_handle_boolean_values(self):
+        config_name = 'mon_allow_pool_delete'
+        orig_value = self._get_config_by_name(config_name)
+
+        # remove all existing settings for equal preconditions
+        self._clear_all_values_for_config_option(config_name)
+
+        expected_result = [{'section': 'mon', 'value': 'true'}]
+
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': [{'section': 'mon', 'value': True}]})
+        self.assertStatus(201)
+
+        self.wait_until_equal(
+            lambda: self._get_config_by_name(config_name),
+            expected_result,
+            timeout=30,
+            period=1)
+
+        # reset original value
+        self._clear_all_values_for_config_option(config_name)
+        self._reset_original_values(config_name, orig_value)
+
+    def test_bulk_set(self):
+        expected_result = {
+            'osd_max_backfills': {'section': 'osd', 'value': '1'},
+            'osd_recovery_max_active': {'section': 'osd', 'value': '3'},
+            'osd_recovery_max_single_start': {'section': 'osd', 'value': '1'},
+            'osd_recovery_sleep': {'section': 'osd', 'value': '2.000000'}
+        }
+        orig_values = dict()
+
+        for config_name in expected_result:
+            orig_values[config_name] = self._get_config_by_name(config_name)
+
+            # remove all existing settings for equal preconditions
+            self._clear_all_values_for_config_option(config_name)
+
+        self._put('/api/cluster_conf', {'options': expected_result})
+        self.assertStatus(200)
+
+        for config_name, value in expected_result.items():
+            self.wait_until_equal(
+                lambda: self._get_config_by_name(config_name),
+                [value],
+                timeout=30,
+                period=1)
+
+            # reset original value
+            self._clear_all_values_for_config_option(config_name)
+            self._reset_original_values(config_name, orig_values[config_name])
+
+    def test_bulk_set_cant_update_at_runtime(self):
+        config_options = {
+            'public_bind_addr': {'section': 'global', 'value': '1.2.3.4:567'},  # not updatable
+            'public_network': {'section': 'global', 'value': '10.0.0.0/8'}  # not updatable
+        }
+        orig_values = dict()
+
+        for config_name in config_options:
+            orig_values[config_name] = self._get_config_by_name(config_name)
+
+        # try to set config options and see if it fails
+        self._put('/api/cluster_conf', {'options': config_options})
+        self.assertStatus(400)
+        self.assertError(code='config_option_not_updatable_at_runtime',
+                         component='cluster_configuration',
+                         detail='Config option {} is/are not updatable at runtime'.format(
+                             ', '.join(config_options.keys())))
+
+        # check if config option values are still the original ones
+        for config_name, value in orig_values.items():
+            self.wait_until_equal(
+                lambda: self._get_config_by_name(config_name),
+                value,
+                timeout=30,
+                period=1)
+
+    def test_bulk_set_cant_update_at_runtime_partial(self):
+        config_options = {
+            'public_bind_addr': {'section': 'global', 'value': 'true'},  # not updatable
+            'log_to_stderr': {'section': 'global', 'value': 'true'}  # updatable
+        }
+        orig_values = dict()
+
+        for config_name in config_options:
+            orig_values[config_name] = self._get_config_by_name(config_name)
+
+        # try to set config options and see if it fails
+        self._put('/api/cluster_conf', {'options': config_options})
+        self.assertStatus(400)
+        self.assertError(code='config_option_not_updatable_at_runtime',
+                         component='cluster_configuration',
+                         detail='Config option {} is/are not updatable at runtime'.format(
+                             'public_bind_addr'))
+
+        # check if config option values are still the original ones
+        for config_name, value in orig_values.items():
+            self.wait_until_equal(
+                lambda: self._get_config_by_name(config_name),
+                value,
+                timeout=30,
+                period=1)
+
+    def test_check_existence(self):
+        """
+        This test case is intended to check the existence of all hard coded config options used by
+        the dashboard.
+        If you include further hard coded options in the dashboard, feel free to add them to the
+        list.
+        """
+        hard_coded_options = [
+            'osd_max_backfills',  # osd-recv-speed
+            'osd_recovery_max_active',  # osd-recv-speed
+            'osd_recovery_max_single_start',  # osd-recv-speed
+            'osd_recovery_sleep',  # osd-recv-speed
+            'osd_scrub_during_recovery',  # osd-pg-scrub
+            'osd_scrub_begin_hour',  # osd-pg-scrub
+            'osd_scrub_end_hour',  # osd-pg-scrub
+            'osd_scrub_begin_week_day',  # osd-pg-scrub
+            'osd_scrub_end_week_day',  # osd-pg-scrub
+            'osd_scrub_min_interval',  # osd-pg-scrub
+            'osd_scrub_max_interval',  # osd-pg-scrub
+            'osd_deep_scrub_interval',  # osd-pg-scrub
+            'osd_scrub_auto_repair',  # osd-pg-scrub
+            'osd_max_scrubs',  # osd-pg-scrub
+            'osd_scrub_priority',  # osd-pg-scrub
+            'osd_scrub_sleep',  # osd-pg-scrub
+            'osd_scrub_auto_repair_num_errors',  # osd-pg-scrub
+            'osd_debug_deep_scrub_sleep',  # osd-pg-scrub
+            'osd_deep_scrub_keys',  # osd-pg-scrub
+            'osd_deep_scrub_large_omap_object_key_threshold',  # osd-pg-scrub
+            'osd_deep_scrub_large_omap_object_value_sum_threshold',  # osd-pg-scrub
+            'osd_deep_scrub_randomize_ratio',  # osd-pg-scrub
+            'osd_deep_scrub_stride',  # osd-pg-scrub
+            'osd_deep_scrub_update_digest_min_age',  # osd-pg-scrub
+            'osd_requested_scrub_priority',  # osd-pg-scrub
+            'osd_scrub_backoff_ratio',  # osd-pg-scrub
+            'osd_scrub_chunk_max',  # osd-pg-scrub
+            'osd_scrub_chunk_min',  # osd-pg-scrub
+            'osd_scrub_cost',  # osd-pg-scrub
+            'osd_scrub_interval_randomize_ratio',  # osd-pg-scrub
+            'osd_scrub_invalid_stats',  # osd-pg-scrub
+            'osd_scrub_load_threshold',  # osd-pg-scrub
+            'osd_scrub_max_preemptions',  # osd-pg-scrub
+            'mon_allow_pool_delete'  # pool-list
+        ]
+
+        for config_option in hard_coded_options:
+            self._get('/api/cluster_conf/{}'.format(config_option))
+            self.assertStatus(200)
+
+    def _validate_single(self, data):
+        self.assertIn('name', data)
+        self.assertIn('daemon_default', data)
+        self.assertIn('long_desc', data)
+        self.assertIn('level', data)
+        self.assertIn('default', data)
+        self.assertIn('see_also', data)
+        self.assertIn('tags', data)
+        self.assertIn('min', data)
+        self.assertIn('max', data)
+        self.assertIn('services', data)
+        self.assertIn('type', data)
+        self.assertIn('desc', data)
+        self.assertIn(data['type'], ['str', 'bool', 'float', 'int', 'size', 'uint', 'addr',
+                                     'addrvec', 'uuid', 'secs', 'millisecs'])
+
+        if 'value' in data:
+            self.assertIn('source', data)
+            self.assertIsInstance(data['value'], list)
+
+            for entry in data['value']:
+                self.assertIsInstance(entry, dict)
+                self.assertIn('section', entry)
+                self.assertIn('value', entry)
+
+    def _get_config_by_name(self, conf_name):
+        data = self._get('/api/cluster_conf/{}'.format(conf_name))
+        if 'value' in data:
+            return data['value']
+        return None
+
+    def _clear_all_values_for_config_option(self, config_name):
+        values = self._get_config_by_name(config_name)
+        if values:
+            for value in values:
+                self._ceph_cmd(['config', 'rm', value['section'], config_name])
+
+    def _reset_original_values(self, config_name, orig_values):
+        if orig_values:
+            for value in orig_values:
+                self._ceph_cmd(['config', 'set', value['section'], config_name, value['value']])
diff --git a/qa/tasks/mgr/dashboard/test_crush_rule.py b/qa/tasks/mgr/dashboard/test_crush_rule.py
new file mode 100644
index 000000000..1e37553b2
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_crush_rule.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase, JList, JObj
+
+
+class CrushRuleTest(DashboardTestCase):
+
+    AUTH_ROLES = ['pool-manager']
+
+    rule_schema = JObj(sub_elems={
+        'max_size': int,
+        'min_size': int,
+        'rule_id': int,
+        'rule_name': str,
+        'ruleset': int,
+        'steps': JList(JObj({}, allow_unknown=True))
+    }, allow_unknown=True)
+
+    def create_and_delete_rule(self, data):
+        name = data['name']
+        # Creates rule
+        self._post('/api/crush_rule', data)
+        self.assertStatus(201)
+        # Makes sure rule exists
+        rule = self._get('/api/crush_rule/{}'.format(name))
+        self.assertStatus(200)
+        self.assertSchemaBody(self.rule_schema)
+        self.assertEqual(rule['rule_name'], name)
+        # Deletes rule
+        self._delete('/api/crush_rule/{}'.format(name))
+        self.assertStatus(204)
+
+    @DashboardTestCase.RunAs('test', 'test', ['rgw-manager'])
+    def test_read_access_permissions(self):
+        self._get('/api/crush_rule')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', ['read-only'])
+    def test_write_access_permissions(self):
+        self._get('/api/crush_rule')
+        self.assertStatus(200)
+        data = {'name': 'some_rule', 'root': 'default', 'failure_domain': 'osd'}
+        self._post('/api/crush_rule', data)
+        self.assertStatus(403)
+        self._delete('/api/crush_rule/default')
+        self.assertStatus(403)
+
+    @classmethod
+    def tearDownClass(cls):
+        super(CrushRuleTest, cls).tearDownClass()
+        cls._ceph_cmd(['osd', 'crush', 'rule', 'rm', 'some_rule'])
+        cls._ceph_cmd(['osd', 'crush', 'rule', 'rm', 'another_rule'])
+
+    def test_list(self):
+        self._get('/api/crush_rule')
+        self.assertStatus(200)
+        self.assertSchemaBody(JList(self.rule_schema))
+
+    def test_create(self):
+        self.create_and_delete_rule({
+            'name': 'some_rule',
+            'root': 'default',
+            'failure_domain': 'osd'
+        })
+
+    @DashboardTestCase.RunAs('test', 'test', ['pool-manager', 'cluster-manager'])
+    def test_create_with_ssd(self):
+        data = self._get('/api/osd/0')
+        self.assertStatus(200)
+        device_class = data['osd_metadata']['default_device_class']
+        self.create_and_delete_rule({
+            'name': 'another_rule',
+            'root': 'default',
+            'failure_domain': 'osd',
+            'device_class': device_class
+        })
+
+    def test_crush_rule_info(self):
+        self._get('/ui-api/crush_rule/info')
+        self.assertStatus(200)
+        self.assertSchemaBody(JObj({
+            'names': JList(str),
+            'nodes': JList(JObj({}, allow_unknown=True)),
+            'roots': JList(int)
+        }))
diff --git a/qa/tasks/mgr/dashboard/test_erasure_code_profile.py b/qa/tasks/mgr/dashboard/test_erasure_code_profile.py
new file mode 100644
index 000000000..7fb7c1c82
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_erasure_code_profile.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase, JList, JObj
+
+
+class ECPTest(DashboardTestCase):
+
+    AUTH_ROLES = ['pool-manager']
+
+    @DashboardTestCase.RunAs('test', 'test', ['rgw-manager'])
+    def test_read_access_permissions(self):
+        self._get('/api/erasure_code_profile')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', ['read-only'])
+    def test_write_access_permissions(self):
+        self._get('/api/erasure_code_profile')
+        self.assertStatus(200)
+        data = {'name': 'ecp32', 'k': 3, 'm': 2}
+        self._post('/api/erasure_code_profile', data)
+        self.assertStatus(403)
+        self._delete('/api/erasure_code_profile/default')
+        self.assertStatus(403)
+
+    @classmethod
+    def tearDownClass(cls):
+        super(ECPTest, cls).tearDownClass()
+        cls._ceph_cmd(['osd', 'erasure-code-profile', 'rm', 'ecp32'])
+        cls._ceph_cmd(['osd', 'erasure-code-profile', 'rm', 'lrc'])
+
+    def test_list(self):
+        data = self._get('/api/erasure_code_profile')
+        self.assertStatus(200)
+
+        default = [p for p in data if p['name'] == 'default']
+        if default:
+            default_ecp = {
+                'k': 2,
+                'technique': 'reed_sol_van',
+                'm': 1,
+                'name': 'default',
+                'plugin': 'jerasure'
+            }
+            if 'crush-failure-domain' in default[0]:
+                default_ecp['crush-failure-domain'] = default[0]['crush-failure-domain']
+            self.assertSubset(default_ecp, default[0])
+            get_data = self._get('/api/erasure_code_profile/default')
+            self.assertEqual(get_data, default[0])
+
+    def test_create(self):
+        data = {'name': 'ecp32', 'k': 3, 'm': 2}
+        self._post('/api/erasure_code_profile', data)
+        self.assertStatus(201)
+
+        self._get('/api/erasure_code_profile/ecp32')
+        self.assertJsonSubset({
+            'crush-device-class': '',
+            'crush-failure-domain': 'osd',
+            'crush-root': 'default',
+            'jerasure-per-chunk-alignment': 'false',
+            'k': 3,
+            'm': 2,
+            'name': 'ecp32',
+            'plugin': 'jerasure',
+            'technique': 'reed_sol_van',
+        })
+
+        self.assertStatus(200)
+
+        self._delete('/api/erasure_code_profile/ecp32')
+        self.assertStatus(204)
+
+    def test_create_plugin(self):
+        data = {'name': 'lrc', 'k': '2', 'm': '2', 'l': '2', 'plugin': 'lrc'}
+        self._post('/api/erasure_code_profile', data)
+        self.assertJsonBody(None)
+        self.assertStatus(201)
+
+        self._get('/api/erasure_code_profile/lrc')
+        self.assertJsonBody({
+            'crush-device-class': '',
+            'crush-failure-domain': 'host',
+            'crush-root': 'default',
+            'k': 2,
+            'l': '2',
+            'm': 2,
+            'name': 'lrc',
+            'plugin': 'lrc'
+        })
+
+        self.assertStatus(200)
+
+        self._delete('/api/erasure_code_profile/lrc')
+        self.assertStatus(204)
+
+    def test_ecp_info(self):
+        self._get('/ui-api/erasure_code_profile/info')
+        self.assertSchemaBody(JObj({
+            'names': JList(str),
+            'plugins': JList(str),
+            'directory': str,
+            'nodes': JList(JObj({}, allow_unknown=True))
+        }))
diff --git a/qa/tasks/mgr/dashboard/test_health.py b/qa/tasks/mgr/dashboard/test_health.py
new file mode 100644
index 000000000..693b7b65c
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_health.py
@@ -0,0 +1,301 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from .helper import (DashboardTestCase, JAny, JLeaf, JList, JObj,
+                     addrvec_schema, module_options_schema)
+
+
+class HealthTest(DashboardTestCase):
+    CEPHFS = True
+
+    __pg_info_schema = JObj({
+        'object_stats': JObj({
+            'num_objects': int,
+            'num_object_copies': int,
+            'num_objects_degraded': int,
+            'num_objects_misplaced': int,
+            'num_objects_unfound': int
+        }),
+        'pgs_per_osd': float,
+        'statuses': JObj({}, allow_unknown=True, unknown_schema=int)
+    })
+
+    __mdsmap_schema = JObj({
+        'session_autoclose': int,
+        'balancer': str,
+        'up': JObj({}, allow_unknown=True),
+        'last_failure_osd_epoch': int,
+        'in': JList(int),
+        'last_failure': int,
+        'max_file_size': int,
+        'explicitly_allowed_features': int,
+        'damaged': JList(int),
+        'tableserver': int,
+        'failed': JList(int),
+        'metadata_pool': int,
+        'epoch': int,
+        'stopped': JList(int),
+        'max_mds': int,
+        'compat': JObj({
+            'compat': JObj({}, allow_unknown=True),
+            'ro_compat': JObj({}, allow_unknown=True),
+            'incompat': JObj({}, allow_unknown=True)
+        }),
+        'required_client_features': JObj({}, allow_unknown=True),
+        'data_pools': JList(int),
+        'info': JObj({}, allow_unknown=True),
+        'fs_name': str,
+        'created': str,
+        'standby_count_wanted': int,
+        'enabled': bool,
+        'modified': str,
+        'session_timeout': int,
+        'flags': int,
+        'ever_allowed_features': int,
+        'root': int
+    })
+
+    def test_minimal_health(self):
+        data = self._get('/api/health/minimal')
+        self.assertStatus(200)
+        schema = JObj({
+            'client_perf': JObj({
+                'read_bytes_sec': int,
+                'read_op_per_sec': int,
+                'recovering_bytes_per_sec': int,
+                'write_bytes_sec': int,
+                'write_op_per_sec': int
+            }),
+            'df': JObj({
+                'stats': JObj({
+                    'total_avail_bytes': int,
+                    'total_bytes': int,
+                    'total_used_raw_bytes': int,
+                })
+            }),
+            'fs_map': JObj({
+                'filesystems': JList(
+                    JObj({
+                        'mdsmap': self.__mdsmap_schema
+                    }),
+                ),
+                'standbys': JList(JObj({}, allow_unknown=True)),
+            }),
+            'health': JObj({
+                'checks': JList(JObj({}, allow_unknown=True)),
+                'mutes': JList(JObj({}, allow_unknown=True)),
+                'status': str,
+            }),
+            'hosts': int,
+            'iscsi_daemons': JObj({
+                'up': int,
+                'down': int
+            }),
+            'mgr_map': JObj({
+                'active_name': str,
+                'standbys': JList(JLeaf(dict))
+            }),
+            'mon_status': JObj({
+                'monmap': JObj({
+                    'mons': JList(JLeaf(dict)),
+                }),
+                'quorum': JList(int)
+            }),
+            'osd_map': JObj({
+                'osds': JList(
+                    JObj({
+                        'in': int,
+                        'up': int,
+                        'state': JList(str)
+                    })),
+            }),
+            'pg_info': self.__pg_info_schema,
+            'pools': JList(JLeaf(dict)),
+            'rgw': int,
+            'scrub_status': str
+        })
+        self.assertSchema(data, schema)
+
+    def test_full_health(self):
+        data = self._get('/api/health/full')
+        self.assertStatus(200)
+        module_info_schema = JObj({
+            'can_run': bool,
+            'error_string': str,
+            'name': str,
+            'module_options': module_options_schema
+        })
+        schema = JObj({
+            'client_perf': JObj({
+                'read_bytes_sec': int,
+                'read_op_per_sec': int,
+                'recovering_bytes_per_sec': int,
+                'write_bytes_sec': int,
+                'write_op_per_sec': int
+            }),
+            'df': JObj({
+                'pools': JList(JObj({
+                    'stats': JObj({
+                        'stored': int,
+                        'stored_data': int,
+                        'stored_omap': int,
+                        'objects': int,
+                        'kb_used': int,
+                        'bytes_used': int,
+                        'data_bytes_used': int,
+                        'omap_bytes_used': int,
+                        'percent_used': float,
+                        'max_avail': int,
+                        'quota_objects': int,
+                        'quota_bytes': int,
+                        'dirty': int,
+                        'rd': int,
+                        'rd_bytes': int,
+                        'wr': int,
+                        'wr_bytes': int,
+                        'compress_bytes_used': int,
+                        'compress_under_bytes': int,
+                        'stored_raw': int,
+                        'avail_raw': int
+                    }),
+                    'name': str,
+                    'id': int
+                })),
+                'stats': JObj({
+                    'total_avail_bytes': int,
+                    'total_bytes': int,
+                    'total_used_bytes': int,
+                    'total_used_raw_bytes': int,
+                    'total_used_raw_ratio': float,
+                    'num_osds': int,
+                    'num_per_pool_osds': int,
+                    'num_per_pool_omap_osds': int
+                })
+            }),
+            'fs_map': JObj({
+                'compat': JObj({
+                    'compat': JObj({}, allow_unknown=True, unknown_schema=str),
+                    'incompat': JObj(
+                        {}, allow_unknown=True, unknown_schema=str),
+                    'ro_compat': JObj(
+                        {}, allow_unknown=True, unknown_schema=str)
+                }),
+                'default_fscid': int,
+                'epoch': int,
+                'feature_flags': JObj(
+                    {}, allow_unknown=True, unknown_schema=bool),
+                'filesystems': JList(
+                    JObj({
+                        'id': int,
+                        'mdsmap': self.__mdsmap_schema
+                    }),
+                ),
+                'standbys': JList(JObj({}, allow_unknown=True)),
+            }),
+            'health': JObj({
+                'checks': JList(JObj({}, allow_unknown=True)),
+                'mutes': JList(JObj({}, allow_unknown=True)),
+                'status': str,
+            }),
+            'hosts': int,
+            'iscsi_daemons': JObj({
+                'up': int,
+                'down': int
+            }),
+            'mgr_map': JObj({
+                'active_addr': str,
+                'active_addrs': JObj({
+                    'addrvec': addrvec_schema
+                }),
+                'active_change': str,  # timestamp
+                'active_mgr_features': int,
+                'active_gid': int,
+                'active_name': str,
+                'always_on_modules': JObj({}, allow_unknown=True),
+                'available': bool,
+                'available_modules': JList(module_info_schema),
+                'epoch': int,
+                'modules': JList(str),
+                'services': JObj(
+                    {'dashboard': str},  # This module should always be present
+                    allow_unknown=True, unknown_schema=str
+                ),
+                'standbys': JList(JObj({
+                    'available_modules': JList(module_info_schema),
+                    'gid': int,
+                    'name': str,
+                    'mgr_features': int
+                }, allow_unknown=True))
+            }, allow_unknown=True),
+            'mon_status': JObj({
+                'election_epoch': int,
+                'extra_probe_peers': JList(JAny(none=True)),
+                'feature_map': JObj(
+                    {}, allow_unknown=True, unknown_schema=JList(JObj({
+                        'features': str,
+                        'num': int,
+                        'release': str
+                    }))
+                ),
+                'features': JObj({
+                    'quorum_con': str,
+                    'quorum_mon': JList(str),
+                    'required_con': str,
+                    'required_mon': JList(str)
+                }),
+                'monmap': JObj({
+                    # @TODO: expand on monmap schema
+                    'mons': JList(JLeaf(dict)),
+                }, allow_unknown=True),
+                'name': str,
+                'outside_quorum': JList(int),
+                'quorum': JList(int),
+                'quorum_age': int,
+                'rank': int,
+                'state': str,
+                # @TODO: What type should be expected here?
+                'sync_provider': JList(JAny(none=True)),
+                'stretch_mode': bool
+            }),
+            'osd_map': JObj({
+                # @TODO: define schema for crush map and osd_metadata, among
+                # others
+                'osds': JList(
+                    JObj({
+                        'in': int,
+                        'up': int,
+                    }, allow_unknown=True)),
+            }, allow_unknown=True),
+            'pg_info': self.__pg_info_schema,
+            'pools': JList(JLeaf(dict)),
+            'rgw': int,
+            'scrub_status': str
+        })
+        self.assertSchema(data, schema)
+
+        cluster_pools = self.ceph_cluster.mon_manager.list_pools()
+        self.assertEqual(len(cluster_pools), len(data['pools']))
+        for pool in data['pools']:
+            self.assertIn(pool['pool_name'], cluster_pools)
+
+    @DashboardTestCase.RunAs('test', 'test', ['pool-manager'])
+    def test_health_permissions(self):
+        data = self._get('/api/health/full')
+        self.assertStatus(200)
+
+        schema = JObj({
+            'client_perf': JObj({}, allow_unknown=True),
+            'df': JObj({}, allow_unknown=True),
+            'health': JObj({
+                'checks': JList(JObj({}, allow_unknown=True)),
+                'mutes': JList(JObj({}, allow_unknown=True)),
+                'status': str
+            }),
+            'pools': JList(JLeaf(dict)),
+        })
+        self.assertSchema(data, schema)
+
+        cluster_pools = self.ceph_cluster.mon_manager.list_pools()
+        self.assertEqual(len(cluster_pools), len(data['pools']))
+        for pool in data['pools']:
+            self.assertIn(pool['pool_name'], cluster_pools)
diff --git a/qa/tasks/mgr/dashboard/test_host.py b/qa/tasks/mgr/dashboard/test_host.py
new file mode 100644
index 000000000..78d784473
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_host.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase, JList, JObj, devices_schema
+
+
+class HostControllerTest(DashboardTestCase):
+
+    AUTH_ROLES = ['read-only']
+
+    URL_HOST = '/api/host'
+    URL_UI_HOST = '/ui-api/host'
+
+    ORCHESTRATOR = True
+
+    @classmethod
+    def setUpClass(cls):
+        super(HostControllerTest, cls).setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        cmd = ['test_orchestrator', 'load_data', '-i', '-']
+        cls.mgr_cluster.mon_manager.raw_cluster_cmd_result(*cmd, stdin='{}')
+
+    @property
+    def test_data_inventory(self):
+        return self.ORCHESTRATOR_TEST_DATA['inventory']
+
+    @property
+    def test_data_daemons(self):
+        return self.ORCHESTRATOR_TEST_DATA['daemons']
+
+    @DashboardTestCase.RunAs('test', 'test', ['block-manager'])
+    def test_access_permissions(self):
+        self._get(self.URL_HOST, version='1.1')
+        self.assertStatus(403)
+
+    def test_host_list(self):
+        data = self._get(self.URL_HOST, version='1.1')
+        self.assertStatus(200)
+
+        orch_hostnames = {inventory_node['name'] for inventory_node in
+                          self.ORCHESTRATOR_TEST_DATA['inventory']}
+
+        for server in data:
+            self.assertIn('services', server)
+            self.assertIn('hostname', server)
+            self.assertIn('ceph_version', server)
+            self.assertIsNotNone(server['hostname'])
+            self.assertIsNotNone(server['ceph_version'])
+            for service in server['services']:
+                self.assertIn('type', service)
+                self.assertIn('id', service)
+                self.assertIsNotNone(service['type'])
+                self.assertIsNotNone(service['id'])
+
+            self.assertIn('sources', server)
+            in_ceph, in_orchestrator = server['sources']['ceph'], server['sources']['orchestrator']
+            if in_ceph:
+                self.assertGreaterEqual(len(server['services']), 1)
+                if not in_orchestrator:
+                    self.assertNotIn(server['hostname'], orch_hostnames)
+            if in_orchestrator:
+                self.assertEqual(len(server['services']), 0)
+                self.assertIn(server['hostname'], orch_hostnames)
+
+    def test_host_list_with_sources(self):
+        data = self._get('{}?sources=orchestrator'.format(self.URL_HOST), version='1.1')
+        self.assertStatus(200)
+        test_hostnames = {inventory_node['name'] for inventory_node in
+                          self.ORCHESTRATOR_TEST_DATA['inventory']}
+        resp_hostnames = {host['hostname'] for host in data}
+        self.assertEqual(test_hostnames, resp_hostnames)
+
+        data = self._get('{}?sources=ceph'.format(self.URL_HOST), version='1.1')
+        self.assertStatus(200)
+        test_hostnames = {inventory_node['name'] for inventory_node in
+                          self.ORCHESTRATOR_TEST_DATA['inventory']}
+        resp_hostnames = {host['hostname'] for host in data}
+        self.assertEqual(len(test_hostnames.intersection(resp_hostnames)), 0)
+
+    def test_host_devices(self):
+        hosts = self._get('{}'.format(self.URL_HOST), version='1.1')
+        hosts = [host['hostname'] for host in hosts if host['hostname'] != '']
+        assert hosts[0]
+        data = self._get('{}/devices'.format('{}/{}'.format(self.URL_HOST, hosts[0])))
+        self.assertStatus(200)
+        self.assertSchema(data, devices_schema)
+
+    def test_host_daemons(self):
+        hosts = self._get('{}'.format(self.URL_HOST), version='1.1')
+        hosts = [host['hostname'] for host in hosts if host['hostname'] != '']
+        assert hosts[0]
+        data = self._get('{}/daemons'.format('{}/{}'.format(self.URL_HOST, hosts[0])))
+        self.assertStatus(200)
+        self.assertSchema(data, JList(JObj({
+            'hostname': str,
+            'daemon_id': str,
+            'daemon_type': str
+        })))
+
+    def test_host_smart(self):
+        hosts = self._get('{}'.format(self.URL_HOST), version='1.1')
+        hosts = [host['hostname'] for host in hosts if host['hostname'] != '']
+        assert hosts[0]
+        self._get('{}/smart'.format('{}/{}'.format(self.URL_HOST, hosts[0])))
+        self.assertStatus(200)
+
+    def _validate_inventory(self, data, resp_data):
+        self.assertEqual(data['name'], resp_data['name'])
+        self.assertEqual(len(data['devices']), len(resp_data['devices']))
+
+        if not data['devices']:
+            return
+        test_devices = sorted(data['devices'], key=lambda d: d['path'])
+        resp_devices = sorted(resp_data['devices'], key=lambda d: d['path'])
+
+        for test, resp in zip(test_devices, resp_devices):
+            self._validate_device(test, resp)
+
+    def _validate_device(self, data, resp_data):
+        for key, value in data.items():
+            self.assertEqual(value, resp_data[key])
+
+    def test_inventory_get(self):
+        # get a inventory
+        node = self.test_data_inventory[0]
+        resp = self._get('{}/{}/inventory'.format(self.URL_HOST, node['name']))
+        self.assertStatus(200)
+        self._validate_inventory(node, resp)
+
+    def test_inventory_list(self):
+        # get all inventory
+        data = self._get('{}/inventory'.format(self.URL_UI_HOST))
+        self.assertStatus(200)
+
+        def sorting_key(node):
+            return node['name']
+
+        test_inventory = sorted(self.test_data_inventory, key=sorting_key)
+        resp_inventory = sorted(data, key=sorting_key)
+        self.assertEqual(len(test_inventory), len(resp_inventory))
+        for test, resp in zip(test_inventory, resp_inventory):
+            self._validate_inventory(test, resp)
+
+
+class HostControllerNoOrchestratorTest(DashboardTestCase):
+    def test_host_create(self):
+        self._post('/api/host?hostname=foo', {'status': ''}, version='0.1')
+        self.assertStatus(503)
+        self.assertError(code='orchestrator_status_unavailable',
+                         component='orchestrator')
+
+    def test_host_delete(self):
+        self._delete('/api/host/bar')
+        self.assertStatus(503)
+        self.assertError(code='orchestrator_status_unavailable',
+                         component='orchestrator')
diff --git a/qa/tasks/mgr/dashboard/test_logs.py b/qa/tasks/mgr/dashboard/test_logs.py
new file mode 100644
index 000000000..63f6e16ed
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_logs.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase, JList, JObj, addrvec_schema
+
+
+class LogsTest(DashboardTestCase):
+    CEPHFS = True
+
+    def test_logs(self):
+        data = self._get("/api/logs/all")
+        self.assertStatus(200)
+        log_entry_schema = JList(JObj({
+            'addrs': JObj({
+                'addrvec': addrvec_schema
+            }),
+            'channel': str,
+            'message': str,
+            'name': str,
+            'priority': str,
+            'rank': str,
+            'seq': int,
+            'stamp': str
+        }))
+        schema = JObj({
+            'audit_log': log_entry_schema,
+            'clog': log_entry_schema
+        })
+        self.assertSchema(data, schema)
+
+    @DashboardTestCase.RunAs('test', 'test', ['pool-manager'])
+    def test_log_perms(self):
+        self._get("/api/logs/all")
+        self.assertStatus(403)
diff --git a/qa/tasks/mgr/dashboard/test_mgr_module.py b/qa/tasks/mgr/dashboard/test_mgr_module.py
new file mode 100644
index 000000000..40bfa0025
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_mgr_module.py
@@ -0,0 +1,150 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+import logging
+
+import requests
+
+from .helper import (DashboardTestCase, JLeaf, JList, JObj,
+                     module_options_object_schema, module_options_schema)
+
+logger = logging.getLogger(__name__)
+
+
+class MgrModuleTestCase(DashboardTestCase):
+    MGRS_REQUIRED = 1
+
+    def wait_until_rest_api_accessible(self):
+        """
+        Wait until the REST API is accessible.
+        """
+
+        def _check_connection():
+            try:
+                # Try reaching an API endpoint successfully.
+                self._get('/api/mgr/module')
+                if self._resp.status_code == 200:
+                    return True
+            except requests.ConnectionError:
+                pass
+            return False
+
+        self.wait_until_true(_check_connection, timeout=30)
+
+
+class MgrModuleTest(MgrModuleTestCase):
+
+    def test_list_disabled_module(self):
+        self._ceph_cmd(['mgr', 'module', 'disable', 'iostat'])
+        self.wait_until_rest_api_accessible()
+        data = self._get('/api/mgr/module')
+        self.assertStatus(200)
+        self.assertSchema(
+            data,
+            JList(
+                JObj(sub_elems={
+                    'name': JLeaf(str),
+                    'enabled': JLeaf(bool),
+                    'always_on': JLeaf(bool),
+                    'options': module_options_schema
+                })))
+        module_info = self.find_object_in_list('name', 'iostat', data)
+        self.assertIsNotNone(module_info)
+        self.assertFalse(module_info['enabled'])
+
+    def test_list_enabled_module(self):
+        self._ceph_cmd(['mgr', 'module', 'enable', 'iostat'])
+        self.wait_until_rest_api_accessible()
+        data = self._get('/api/mgr/module')
+        self.assertStatus(200)
+        self.assertSchema(
+            data,
+            JList(
+                JObj(sub_elems={
+                    'name': JLeaf(str),
+                    'enabled': JLeaf(bool),
+                    'always_on': JLeaf(bool),
+                    'options': module_options_schema
+                })))
+        module_info = self.find_object_in_list('name', 'iostat', data)
+        self.assertIsNotNone(module_info)
+        self.assertTrue(module_info['enabled'])
+
+    def test_get(self):
+        data = self._get('/api/mgr/module/telemetry')
+        self.assertStatus(200)
+        self.assertSchema(
+            data,
+            JObj(
+                allow_unknown=True,
+                sub_elems={
+                    'channel_basic': bool,
+                    'channel_ident': bool,
+                    'channel_crash': bool,
+                    'channel_device': bool,
+                    'contact': str,
+                    'description': str,
+                    'enabled': bool,
+                    'interval': int,
+                    'last_opt_revision': int,
+                    'leaderboard': bool,
+                    'organization': str,
+                    'proxy': str,
+                    'url': str
+                }))
+
+    def test_module_options(self):
+        data = self._get('/api/mgr/module/telemetry/options')
+        self.assertStatus(200)
+        schema = JObj({
+            'channel_basic': module_options_object_schema,
+            'channel_crash': module_options_object_schema,
+            'channel_device': module_options_object_schema,
+            'channel_ident': module_options_object_schema,
+            'contact': module_options_object_schema,
+            'description': module_options_object_schema,
+            'device_url': module_options_object_schema,
+            'enabled': module_options_object_schema,
+            'interval': module_options_object_schema,
+            'last_opt_revision': module_options_object_schema,
+            'leaderboard': module_options_object_schema,
+            'log_level': module_options_object_schema,
+            'log_to_cluster': module_options_object_schema,
+            'log_to_cluster_level': module_options_object_schema,
+            'log_to_file': module_options_object_schema,
+            'organization': module_options_object_schema,
+            'proxy': module_options_object_schema,
+            'url': module_options_object_schema
+        })
+        self.assertSchema(data, schema)
+
+    def test_module_enable(self):
+        self._post('/api/mgr/module/telemetry/enable')
+        self.assertStatus(200)
+
+    def test_disable(self):
+        self._post('/api/mgr/module/iostat/disable')
+        self.assertStatus(200)
+
+    def test_put(self):
+        self.set_config_key('config/mgr/mgr/iostat/log_level', 'critical')
+        self.set_config_key('config/mgr/mgr/iostat/log_to_cluster', 'False')
+        self.set_config_key('config/mgr/mgr/iostat/log_to_cluster_level', 'info')
+        self.set_config_key('config/mgr/mgr/iostat/log_to_file', 'True')
+        self._put(
+            '/api/mgr/module/iostat',
+            data={
+                'config': {
+                    'log_level': 'debug',
+                    'log_to_cluster': True,
+                    'log_to_cluster_level': 'warning',
+                    'log_to_file': False
+                }
+            })
+        self.assertStatus(200)
+        data = self._get('/api/mgr/module/iostat')
+        self.assertStatus(200)
+        self.assertEqual(data['log_level'], 'debug')
+        self.assertTrue(data['log_to_cluster'])
+        self.assertEqual(data['log_to_cluster_level'], 'warning')
+        self.assertFalse(data['log_to_file'])
diff --git a/qa/tasks/mgr/dashboard/test_monitor.py b/qa/tasks/mgr/dashboard/test_monitor.py
new file mode 100644
index 000000000..e32c2c10c
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_monitor.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase
+
+
+class MonitorTest(DashboardTestCase):
+    AUTH_ROLES = ['cluster-manager']
+
+    @DashboardTestCase.RunAs('test', 'test', ['block-manager'])
+    def test_access_permissions(self):
+        self._get('/api/monitor')
+        self.assertStatus(403)
+
+    def test_monitor_default(self):
+        data = self._get("/api/monitor")
+        self.assertStatus(200)
+
+        self.assertIn('mon_status', data)
+        self.assertIn('in_quorum', data)
+        self.assertIn('out_quorum', data)
+        self.assertIsNotNone(data['mon_status'])
+        self.assertIsNotNone(data['in_quorum'])
+        self.assertIsNotNone(data['out_quorum'])
diff --git a/qa/tasks/mgr/dashboard/test_motd.py b/qa/tasks/mgr/dashboard/test_motd.py
new file mode 100644
index 000000000..2edbf36ba
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_motd.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=too-many-public-methods
+
+from __future__ import absolute_import
+
+import time
+
+from .helper import DashboardTestCase
+
+
+class MotdTest(DashboardTestCase):
+    @classmethod
+    def tearDownClass(cls):
+        cls._ceph_cmd(['dashboard', 'motd', 'clear'])
+        super(MotdTest, cls).tearDownClass()
+
+    def setUp(self):
+        super(MotdTest, self).setUp()
+        self._ceph_cmd(['dashboard', 'motd', 'clear'])
+
+    def test_none(self):
+        data = self._get('/ui-api/motd')
+        self.assertStatus(200)
+        self.assertIsNone(data)
+
+    def test_set(self):
+        self._ceph_cmd(['dashboard', 'motd', 'set', 'info', '0', 'foo bar baz'])
+        data = self._get('/ui-api/motd')
+        self.assertStatus(200)
+        self.assertIsInstance(data, dict)
+
+    def test_expired(self):
+        self._ceph_cmd(['dashboard', 'motd', 'set', 'info', '2s', 'foo bar baz'])
+        time.sleep(5)
+        data = self._get('/ui-api/motd')
+        self.assertStatus(200)
+        self.assertIsNone(data)
diff --git a/qa/tasks/mgr/dashboard/test_orchestrator.py b/qa/tasks/mgr/dashboard/test_orchestrator.py
new file mode 100644
index 000000000..2a804c4c2
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_orchestrator.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase
+
+
+class OrchestratorControllerTest(DashboardTestCase):
+
+    AUTH_ROLES = ['cluster-manager']
+
+    URL_STATUS = '/ui-api/orchestrator/status'
+
+    ORCHESTRATOR = True
+
+    @classmethod
+    def setUpClass(cls):
+        super(OrchestratorControllerTest, cls).setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        cmd = ['test_orchestrator', 'load_data', '-i', '-']
+        cls.mgr_cluster.mon_manager.raw_cluster_cmd_result(*cmd, stdin='{}')
+
+    def test_status_get(self):
+        data = self._get(self.URL_STATUS)
+        self.assertStatus(200)
+        self.assertTrue(data['available'])
diff --git a/qa/tasks/mgr/dashboard/test_osd.py b/qa/tasks/mgr/dashboard/test_osd.py
new file mode 100644
index 000000000..71cf3d871
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_osd.py
@@ -0,0 +1,368 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+import json
+
+from .helper import (DashboardTestCase, JAny, JLeaf, JList, JObj, JTuple,
+                     devices_schema)
+
+
+class OsdTest(DashboardTestCase):
+
+    AUTH_ROLES = ['cluster-manager']
+
+    @classmethod
+    def setUpClass(cls):
+        super(OsdTest, cls).setUpClass()
+        cls._load_module('test_orchestrator')
+        cmd = ['orch', 'set', 'backend', 'test_orchestrator']
+        cls.mgr_cluster.mon_manager.raw_cluster_cmd(*cmd)
+
+    def tearDown(self):
+        self._put('/api/osd/0/mark', data={'action': 'in'})
+
+    @DashboardTestCase.RunAs('test', 'test', ['block-manager'])
+    def test_access_permissions(self):
+        self._get('/api/osd')
+        self.assertStatus(403)
+        self._get('/api/osd/0')
+        self.assertStatus(403)
+
+    def assert_in_and_not_none(self, data, properties):
+        self.assertSchema(data, JObj({p: JAny(none=False) for p in properties}, allow_unknown=True))
+
+    def test_list(self):
+        data = self._get('/api/osd')
+        self.assertStatus(200)
+
+        self.assertGreaterEqual(len(data), 1)
+        data = data[0]
+        self.assert_in_and_not_none(data, ['host', 'tree', 'state', 'stats', 'stats_history'])
+        self.assert_in_and_not_none(data['host'], ['name'])
+        self.assert_in_and_not_none(data['tree'], ['id'])
+        self.assert_in_and_not_none(data['stats'], ['numpg', 'stat_bytes_used', 'stat_bytes',
+                                                    'op_r', 'op_w'])
+        self.assert_in_and_not_none(data['stats_history'], ['op_out_bytes', 'op_in_bytes'])
+        self.assertSchema(data['stats_history']['op_out_bytes'],
+                          JList(JTuple([JLeaf(float), JLeaf(float)])))
+
+    def test_details(self):
+        data = self._get('/api/osd/0')
+        self.assertStatus(200)
+        self.assert_in_and_not_none(data, ['osd_metadata'])
+
+    def test_histogram(self):
+        data = self._get('/api/osd/0/histogram')
+        self.assertStatus(200)
+        self.assert_in_and_not_none(data['osd'], ['op_w_latency_in_bytes_histogram',
+                                                  'op_r_latency_out_bytes_histogram'])
+
+    def test_scrub(self):
+        self._post('/api/osd/0/scrub?deep=False')
+        self.assertStatus(200)
+
+        self._post('/api/osd/0/scrub?deep=True')
+        self.assertStatus(200)
+
+    def test_safe_to_delete(self):
+        data = self._get('/api/osd/safe_to_delete?svc_ids=0')
+        self.assertStatus(200)
+        self.assertSchema(data, JObj({
+            'is_safe_to_delete': JAny(none=True),
+            'message': str
+        }))
+        self.assertTrue(data['is_safe_to_delete'])
+
+    def test_osd_smart(self):
+        self._get('/api/osd/0/smart')
+        self.assertStatus(200)
+
+    def test_mark_out_and_in(self):
+        self._put('/api/osd/0/mark', data={'action': 'out'})
+        self.assertStatus(200)
+
+        self._put('/api/osd/0/mark', data={'action': 'in'})
+        self.assertStatus(200)
+
+    def test_mark_down(self):
+        self._put('/api/osd/0/mark', data={'action': 'down'})
+        self.assertStatus(200)
+
+    def test_reweight(self):
+        self._post('/api/osd/0/reweight', {'weight': 0.4})
+        self.assertStatus(200)
+
+        def get_reweight_value():
+            self._get('/api/osd/0')
+            response = self.jsonBody()
+            if 'osd_map' in response and 'weight' in response['osd_map']:
+                return round(response['osd_map']['weight'], 1)
+            return None
+        self.wait_until_equal(get_reweight_value, 0.4, 10)
+        self.assertStatus(200)
+
+        # Undo
+        self._post('/api/osd/0/reweight', {'weight': 1})
+
+    def test_create_lost_destroy_remove(self):
+        sample_data = {
+            'uuid': 'f860ca2e-757d-48ce-b74a-87052cad563f',
+            'svc_id': 5
+        }
+
+        # Create
+        self._task_post('/api/osd', {
+            'method': 'bare',
+            'data': sample_data,
+            'tracking_id': 'bare-5'
+        })
+        self.assertStatus(201)
+
+        # invalid method
+        self._task_post('/api/osd', {
+            'method': 'xyz',
+            'data': {
+                'uuid': 'f860ca2e-757d-48ce-b74a-87052cad563f',
+                'svc_id': 5
+            },
+            'tracking_id': 'bare-5'
+        })
+        self.assertStatus(400)
+
+        # Lost
+        self._put('/api/osd/5/mark', data={'action': 'lost'})
+        self.assertStatus(200)
+        # Destroy
+        self._post('/api/osd/5/destroy')
+        self.assertStatus(200)
+        # Purge
+        self._post('/api/osd/5/purge')
+        self.assertStatus(200)
+
+    def test_create_with_drive_group(self):
+        data = {
+            'method': 'drive_groups',
+            'data': [
+                {
+                    'service_type': 'osd',
+                    'service_id': 'test',
+                    'host_pattern': '*',
+                    'data_devices': {
+                        'vendor': 'abc',
+                        'model': 'cba',
+                        'rotational': True,
+                        'size': '4 TB'
+                    },
+                    'wal_devices': {
+                        'vendor': 'def',
+                        'model': 'fed',
+                        'rotational': False,
+                        'size': '1 TB'
+                    },
+                    'db_devices': {
+                        'vendor': 'ghi',
+                        'model': 'ihg',
+                        'rotational': False,
+                        'size': '512 GB'
+                    },
+                    'wal_slots': 5,
+                    'db_slots': 5,
+                    'encrypted': True
+                }
+            ],
+            'tracking_id': 'test'
+        }
+        self._post('/api/osd', data)
+        self.assertStatus(201)
+
+    def test_safe_to_destroy(self):
+        osd_dump = json.loads(self._ceph_cmd(['osd', 'dump', '-f', 'json']))
+        max_id = max(map(lambda e: e['osd'], osd_dump['osds']))
+
+        def get_pg_status_equal_unknown(osd_ids):
+            self._get('/api/osd/safe_to_destroy?ids={}'.format(osd_ids))
+            if 'message' in self.jsonBody():
+                return 'pgs have unknown state' in self.jsonBody()['message']
+            return False
+
+        # 1 OSD safe to destroy
+        unused_osd_id = max_id + 10
+        self.wait_until_equal(
+            lambda: get_pg_status_equal_unknown(unused_osd_id), False, 30)
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'is_safe_to_destroy': True,
+            'active': [],
+            'missing_stats': [],
+            'safe_to_destroy': [unused_osd_id],
+            'stored_pgs': [],
+        })
+
+        # multiple OSDs safe to destroy
+        unused_osd_ids = [max_id + 11, max_id + 12]
+        self.wait_until_equal(
+            lambda: get_pg_status_equal_unknown(str(unused_osd_ids)), False, 30)
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'is_safe_to_destroy': True,
+            'active': [],
+            'missing_stats': [],
+            'safe_to_destroy': unused_osd_ids,
+            'stored_pgs': [],
+        })
+
+        # 1 OSD unsafe to destroy
+        def get_destroy_status():
+            self._get('/api/osd/safe_to_destroy?ids=0')
+            if 'is_safe_to_destroy' in self.jsonBody():
+                return self.jsonBody()['is_safe_to_destroy']
+            return None
+        self.wait_until_equal(get_destroy_status, False, 10)
+        self.assertStatus(200)
+
+    def test_osd_devices(self):
+        data = self._get('/api/osd/0/devices')
+        self.assertStatus(200)
+        self.assertSchema(data, devices_schema)
+
+
+class OsdFlagsTest(DashboardTestCase):
+    def __init__(self, *args, **kwargs):
+        super(OsdFlagsTest, self).__init__(*args, **kwargs)
+        self._initial_flags = ['sortbitwise', 'recovery_deletes', 'purged_snapdirs',
+                               'pglog_hardlimit']  # These flags cannot be unset
+
+    @classmethod
+    def _put_flags(cls, flags, ids=None):
+        url = '/api/osd/flags'
+        data = {'flags': flags}
+
+        if ids:
+            url = url + '/individual'
+            data['ids'] = ids
+
+        cls._put(url, data=data)
+        return cls._resp.json()
+
+    def test_list_osd_flags(self):
+        flags = self._get('/api/osd/flags')
+        self.assertStatus(200)
+        self.assertEqual(len(flags), 4)
+        self.assertCountEqual(flags, self._initial_flags)
+
+    def test_add_osd_flag(self):
+        flags = self._put_flags([
+            'sortbitwise', 'recovery_deletes', 'purged_snapdirs', 'noout',
+            'pause', 'pglog_hardlimit'
+        ])
+        self.assertCountEqual(flags, [
+            'sortbitwise', 'recovery_deletes', 'purged_snapdirs', 'noout',
+            'pause', 'pglog_hardlimit'
+        ])
+
+        # Restore flags
+        self._put_flags(self._initial_flags)
+
+    def test_get_indiv_flag(self):
+        initial = self._get('/api/osd/flags/individual')
+        self.assertStatus(200)
+        self.assertSchema(initial, JList(JObj({
+            'osd': int,
+            'flags': JList(str)
+        })))
+
+        self._ceph_cmd(['osd', 'set-group', 'noout,noin', 'osd.0', 'osd.1', 'osd.2'])
+        flags_added = self._get('/api/osd/flags/individual')
+        self.assertStatus(200)
+        for osd in flags_added:
+            if osd['osd'] in [0, 1, 2]:
+                self.assertIn('noout', osd['flags'])
+                self.assertIn('noin', osd['flags'])
+                for osd_initial in initial:
+                    if osd['osd'] == osd_initial['osd']:
+                        self.assertGreater(len(osd['flags']), len(osd_initial['flags']))
+
+        self._ceph_cmd(['osd', 'unset-group', 'noout,noin', 'osd.0', 'osd.1', 'osd.2'])
+        flags_removed = self._get('/api/osd/flags/individual')
+        self.assertStatus(200)
+        for osd in flags_removed:
+            if osd['osd'] in [0, 1, 2]:
+                self.assertNotIn('noout', osd['flags'])
+                self.assertNotIn('noin', osd['flags'])
+
+    def test_add_indiv_flag(self):
+        flags_update = {'noup': None, 'nodown': None, 'noin': None, 'noout': True}
+        svc_id = 0
+
+        resp = self._put_flags(flags_update, [svc_id])
+        self._check_indiv_flags_resp(resp, [svc_id], ['noout'], [], ['noup', 'nodown', 'noin'])
+        self._check_indiv_flags_osd([svc_id], ['noout'], ['noup', 'nodown', 'noin'])
+
+        self._ceph_cmd(['osd', 'unset-group', 'noout', 'osd.{}'.format(svc_id)])
+
+    def test_add_multiple_indiv_flags(self):
+        flags_update = {'noup': None, 'nodown': None, 'noin': True, 'noout': True}
+        svc_id = 0
+
+        resp = self._put_flags(flags_update, [svc_id])
+        self._check_indiv_flags_resp(resp, [svc_id], ['noout', 'noin'], [], ['noup', 'nodown'])
+        self._check_indiv_flags_osd([svc_id], ['noout', 'noin'], ['noup', 'nodown'])
+
+        self._ceph_cmd(['osd', 'unset-group', 'noout,noin', 'osd.{}'.format(svc_id)])
+
+    def test_add_multiple_indiv_flags_multiple_osds(self):
+        flags_update = {'noup': None, 'nodown': None, 'noin': True, 'noout': True}
+        svc_id = [0, 1, 2]
+
+        resp = self._put_flags(flags_update, svc_id)
+        self._check_indiv_flags_resp(resp, svc_id, ['noout', 'noin'], [], ['noup', 'nodown'])
+        self._check_indiv_flags_osd([svc_id], ['noout', 'noin'], ['noup', 'nodown'])
+
+        self._ceph_cmd(['osd', 'unset-group', 'noout,noin', 'osd.0', 'osd.1', 'osd.2'])
+
+    def test_remove_indiv_flag(self):
+        flags_update = {'noup': None, 'nodown': None, 'noin': None, 'noout': False}
+        svc_id = 0
+        self._ceph_cmd(['osd', 'set-group', 'noout', 'osd.{}'.format(svc_id)])
+
+        resp = self._put_flags(flags_update, [svc_id])
+        self._check_indiv_flags_resp(resp, [svc_id], [], ['noout'], ['noup', 'nodown', 'noin'])
+        self._check_indiv_flags_osd([svc_id], [], ['noup', 'nodown', 'noin', 'noout'])
+
+    def test_remove_multiple_indiv_flags(self):
+        flags_update = {'noup': None, 'nodown': None, 'noin': False, 'noout': False}
+        svc_id = 0
+        self._ceph_cmd(['osd', 'set-group', 'noout,noin', 'osd.{}'.format(svc_id)])
+
+        resp = self._put_flags(flags_update, [svc_id])
+        self._check_indiv_flags_resp(resp, [svc_id], [], ['noout', 'noin'], ['noup', 'nodown'])
+        self._check_indiv_flags_osd([svc_id], [], ['noout', 'noin', 'noup', 'nodown'])
+
+    def test_remove_multiple_indiv_flags_multiple_osds(self):
+        flags_update = {'noup': None, 'nodown': None, 'noin': False, 'noout': False}
+        svc_id = [0, 1, 2]
+        self._ceph_cmd(['osd', 'unset-group', 'noout,noin', 'osd.0', 'osd.1', 'osd.2'])
+
+        resp = self._put_flags(flags_update, svc_id)
+        self._check_indiv_flags_resp(resp, svc_id, [], ['noout', 'noin'], ['noup', 'nodown'])
+        self._check_indiv_flags_osd([svc_id], [], ['noout', 'noin', 'noup', 'nodown'])
+
+    def _check_indiv_flags_resp(self, resp, ids, added, removed, ignored):
+        self.assertStatus(200)
+        self.assertCountEqual(resp['ids'], ids)
+        self.assertCountEqual(resp['added'], added)
+        self.assertCountEqual(resp['removed'], removed)
+
+        for flag in ignored:
+            self.assertNotIn(flag, resp['added'])
+            self.assertNotIn(flag, resp['removed'])
+
+    def _check_indiv_flags_osd(self, ids, activated_flags, deactivated_flags):
+        osds = json.loads(self._ceph_cmd(['osd', 'dump', '--format=json']))['osds']
+        for osd in osds:
+            if osd['osd'] in ids:
+                for flag in activated_flags:
+                    self.assertIn(flag, osd['state'])
+                for flag in deactivated_flags:
+                    self.assertNotIn(flag, osd['state'])
diff --git a/qa/tasks/mgr/dashboard/test_perf_counters.py b/qa/tasks/mgr/dashboard/test_perf_counters.py
new file mode 100644
index 000000000..c01368bce
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_perf_counters.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase, JObj
+
+
+class PerfCountersControllerTest(DashboardTestCase):
+
+    def test_perf_counters_list(self):
+        data = self._get('/api/perf_counters')
+        self.assertStatus(200)
+
+        self.assertIsInstance(data, dict)
+        for mon in self.mons():
+            self.assertIn('mon.{}'.format(mon), data)
+
+        osds = self.ceph_cluster.mon_manager.get_osd_dump()
+        for osd in osds:
+            self.assertIn('osd.{}'.format(osd['osd']), data)
+
+    def _validate_perf(self, srv_id, srv_type, data, allow_empty):
+        self.assertIsInstance(data, dict)
+        self.assertEqual(srv_type, data['service']['type'])
+        self.assertEqual(str(srv_id), data['service']['id'])
+        self.assertIsInstance(data['counters'], list)
+        if not allow_empty:
+            self.assertGreater(len(data['counters']), 0)
+        for counter in data['counters'][0:1]:
+            self.assertIsInstance(counter, dict)
+            self.assertIn('description', counter)
+            self.assertIn('name', counter)
+            self.assertIn('unit', counter)
+            self.assertIn('value', counter)
+
+    def test_perf_counters_mon_get(self):
+        mon = self.mons()[0]
+        data = self._get('/api/perf_counters/mon/{}'.format(mon))
+        self.assertStatus(200)
+        self._validate_perf(mon, 'mon', data, allow_empty=False)
+
+    def test_perf_counters_mgr_get(self):
+        mgr = list(self.mgr_cluster.mgr_ids)[0]
+        data = self._get('/api/perf_counters/mgr/{}'.format(mgr))
+        self.assertStatus(200)
+        self._validate_perf(mgr, 'mgr', data, allow_empty=False)
+
+    def test_perf_counters_mds_get(self):
+        for mds in self.mds_cluster.mds_ids:
+            data = self._get('/api/perf_counters/mds/{}'.format(mds))
+            self.assertStatus(200)
+            self._validate_perf(mds, 'mds', data, allow_empty=True)
+
+    def test_perf_counters_osd_get(self):
+        for osd in self.ceph_cluster.mon_manager.get_osd_dump():
+            osd = osd['osd']
+            data = self._get('/api/perf_counters/osd/{}'.format(osd))
+            self.assertStatus(200)
+            self._validate_perf(osd, 'osd', data, allow_empty=False)
+
+    def test_perf_counters_not_found(self):
+        osds = self.ceph_cluster.mon_manager.get_osd_dump()
+        unused_id = int(list(map(lambda o: o['osd'], osds)).pop()) + 1
+
+        self._get('/api/perf_counters/osd/{}'.format(unused_id))
+        self.assertStatus(404)
+        schema = JObj(sub_elems={
+            'status': str,
+            'detail': str,
+        }, allow_unknown=True)
+        self.assertEqual(self._resp.json()['detail'], "'osd.{}' not found".format(unused_id))
+        self.assertSchemaBody(schema)
diff --git a/qa/tasks/mgr/dashboard/test_pool.py b/qa/tasks/mgr/dashboard/test_pool.py
new file mode 100644
index 000000000..055ba2b00
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_pool.py
@@ -0,0 +1,433 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+import logging
+import time
+from contextlib import contextmanager
+
+from .helper import DashboardTestCase, JAny, JList, JObj, JUnion
+
+log = logging.getLogger(__name__)
+
+
+class PoolTest(DashboardTestCase):
+    AUTH_ROLES = ['pool-manager']
+
+    pool_schema = JObj(sub_elems={
+        'pool_name': str,
+        'type': str,
+        'application_metadata': JList(str),
+        'flags': int,
+        'flags_names': str,
+    }, allow_unknown=True)
+
+    pool_list_stat_schema = JObj(sub_elems={
+        'latest': JUnion([int, float]),
+        'rate': float,
+        'rates': JList(JAny(none=False)),
+    })
+
+    pool_list_stats_schema = JObj(sub_elems={
+        'avail_raw': pool_list_stat_schema,
+        'bytes_used': pool_list_stat_schema,
+        'max_avail': pool_list_stat_schema,
+        'percent_used': pool_list_stat_schema,
+        'rd_bytes': pool_list_stat_schema,
+        'wr_bytes': pool_list_stat_schema,
+        'rd': pool_list_stat_schema,
+        'wr': pool_list_stat_schema,
+    }, allow_unknown=True)
+
+    pool_rbd_conf_schema = JList(JObj(sub_elems={
+        'name': str,
+        'value': str,
+        'source': int
+    }))
+
+    @contextmanager
+    def __yield_pool(self, name=None, data=None, deletion_name=None):
+        """
+        Use either just a name or whole description of a pool to create one.
+        This also validates the correct creation and deletion after the pool was used.
+
+        :param name: Name of the pool
+        :param data: Describes the pool in full length
+        :param deletion_name: Only needed if the pool was renamed
+        :return:
+        """
+        data = self._create_pool(name, data)
+        yield data
+        self._delete_pool(deletion_name or data['pool'])
+
+    def _create_pool(self, name, data):
+        data = data or {
+            'pool': name,
+            'pg_num': '32',
+            'pool_type': 'replicated',
+            'compression_algorithm': 'snappy',
+            'compression_mode': 'passive',
+            'compression_max_blob_size': '131072',
+            'compression_required_ratio': '0.875',
+            'application_metadata': ['rbd'],
+            'configuration': {
+                'rbd_qos_bps_limit': 1024000,
+                'rbd_qos_iops_limit': 5000,
+            }
+        }
+        self._task_post('/api/pool/', data)
+        self.assertStatus(201)
+        self._validate_pool_properties(data, self._get_pool(data['pool']))
+        return data
+
+    def _delete_pool(self, name):
+        self._task_delete('/api/pool/' + name)
+        self.assertStatus(204)
+
+    def _validate_pool_properties(self, data, pool, timeout=DashboardTestCase.TIMEOUT_HEALTH_CLEAR):
+        # pylint: disable=too-many-branches
+        for prop, value in data.items():
+            if prop == 'pool_type':
+                self.assertEqual(pool['type'], value)
+            elif prop == 'size':
+                self.assertEqual(pool[prop], int(value),
+                                 '{}: {} != {}'.format(prop, pool[prop], value))
+            elif prop == 'pg_num':
+                self._check_pg_num(pool['pool_name'], int(value))
+            elif prop == 'application_metadata':
+                self.assertIsInstance(pool[prop], list)
+                self.assertEqual(value, pool[prop])
+            elif prop == 'pool':
+                self.assertEqual(pool['pool_name'], value)
+            elif prop.startswith('compression'):
+                if value is not None:
+                    if prop.endswith('size'):
+                        value = int(value)
+                    elif prop.endswith('ratio'):
+                        value = float(value)
+                    self.assertEqual(pool['options'][prop], value)
+                else:
+                    self.assertEqual(pool['options'], {})
+            elif prop == 'configuration':
+                # configuration cannot really be checked here for two reasons:
+                #   1.  The default value cannot be given to this method, which becomes relevant
+                #       when resetting a value, because it's not always zero.
+                #   2.  The expected `source` cannot be given to this method, and it cannot
+                #       reliably be determined (see 1)
+                pass
+            else:
+                self.assertEqual(pool[prop], value, '{}: {} != {}'.format(prop, pool[prop], value))
+
+        self.wait_for_health_clear(timeout)
+
+    def _get_pool(self, pool_name):
+        pool = self._get("/api/pool/" + pool_name)
+        self.assertStatus(200)
+        self.assertSchemaBody(self.pool_schema)
+        return pool
+
+    def _check_pg_num(self, pool_name, pg_num):
+        """
+        If both properties have not the same value, the cluster goes into a warning state, which
+        will only happen during a pg update on an existing pool. The test that does that is
+        currently commented out because our QA systems can't deal with the change. Feel free to test
+        it locally.
+        """
+        self.wait_until_equal(
+            lambda: self._get_pool(pool_name)['pg_placement_num'],
+            expect_val=pg_num,
+            timeout=180
+        )
+
+        pool = self._get_pool(pool_name)
+
+        for prop in ['pg_num', 'pg_placement_num']:
+            self.assertEqual(pool[prop], int(pg_num),
+                             '{}: {} != {}'.format(prop, pool[prop], pg_num))
+
+    @DashboardTestCase.RunAs('test', 'test', [{'pool': ['create', 'update', 'delete']}])
+    def test_read_access_permissions(self):
+        self._get('/api/pool')
+        self.assertStatus(403)
+        self._get('/api/pool/bla')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'pool': ['read', 'update', 'delete']}])
+    def test_create_access_permissions(self):
+        self._task_post('/api/pool/', {})
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'pool': ['read', 'create', 'update']}])
+    def test_delete_access_permissions(self):
+        self._delete('/api/pool/ddd')
+        self.assertStatus(403)
+
+    def test_pool_configuration(self):
+        pool_name = 'device_health_metrics'
+        data = self._get('/api/pool/{}/configuration'.format(pool_name))
+        self.assertStatus(200)
+        self.assertSchema(data, JList(JObj({
+            'name': str,
+            'value': str,
+            'source': int
+        })))
+
+    def test_pool_list(self):
+        data = self._get("/api/pool")
+        self.assertStatus(200)
+
+        cluster_pools = self.ceph_cluster.mon_manager.list_pools()
+        self.assertEqual(len(cluster_pools), len(data))
+        self.assertSchemaBody(JList(self.pool_schema))
+        for pool in data:
+            self.assertNotIn('pg_status', pool)
+            self.assertNotIn('stats', pool)
+            self.assertIn(pool['pool_name'], cluster_pools)
+
+    def test_pool_list_attrs(self):
+        data = self._get("/api/pool?attrs=type,flags")
+        self.assertStatus(200)
+
+        cluster_pools = self.ceph_cluster.mon_manager.list_pools()
+        self.assertEqual(len(cluster_pools), len(data))
+        for pool in data:
+            self.assertIn('pool_name', pool)
+            self.assertIn('type', pool)
+            self.assertIn('flags', pool)
+            self.assertNotIn('flags_names', pool)
+            self.assertNotIn('pg_status', pool)
+            self.assertNotIn('stats', pool)
+            self.assertIn(pool['pool_name'], cluster_pools)
+
+    def test_pool_list_stats(self):
+        data = self._get("/api/pool?stats=true")
+        self.assertStatus(200)
+
+        cluster_pools = self.ceph_cluster.mon_manager.list_pools()
+        self.assertEqual(len(cluster_pools), len(data))
+        self.assertSchemaBody(JList(self.pool_schema))
+        for pool in data:
+            self.assertIn('pool_name', pool)
+            self.assertIn('type', pool)
+            self.assertIn('application_metadata', pool)
+            self.assertIn('flags', pool)
+            self.assertIn('pg_status', pool)
+            self.assertSchema(pool['stats'], self.pool_list_stats_schema)
+            self.assertIn('flags_names', pool)
+            self.assertIn(pool['pool_name'], cluster_pools)
+
+    def test_pool_get(self):
+        cluster_pools = self.ceph_cluster.mon_manager.list_pools()
+        pool = self._get("/api/pool/{}?stats=true&attrs=type,flags,stats"
+                         .format(cluster_pools[0]))
+        self.assertEqual(pool['pool_name'], cluster_pools[0])
+        self.assertIn('type', pool)
+        self.assertIn('flags', pool)
+        self.assertNotIn('pg_status', pool)
+        self.assertSchema(pool['stats'], self.pool_list_stats_schema)
+        self.assertNotIn('flags_names', pool)
+        self.assertSchema(pool['configuration'], self.pool_rbd_conf_schema)
+
+    def test_pool_create_with_two_applications(self):
+        self.__yield_pool(None, {
+            'pool': 'dashboard_pool1',
+            'pg_num': '32',
+            'pool_type': 'replicated',
+            'application_metadata': ['rbd', 'sth'],
+        })
+
+    def test_pool_create_with_ecp_and_rule(self):
+        self._ceph_cmd(['osd', 'crush', 'rule', 'create-erasure', 'ecrule'])
+        self._ceph_cmd(
+            ['osd', 'erasure-code-profile', 'set', 'ecprofile', 'crush-failure-domain=osd'])
+        self.__yield_pool(None, {
+            'pool': 'dashboard_pool2',
+            'pg_num': '32',
+            'pool_type': 'erasure',
+            'application_metadata': ['rbd'],
+            'erasure_code_profile': 'ecprofile',
+            'crush_rule': 'ecrule',
+        })
+        self._ceph_cmd(['osd', 'erasure-code-profile', 'rm', 'ecprofile'])
+
+    def test_pool_create_with_compression(self):
+        pool = {
+            'pool': 'dashboard_pool3',
+            'pg_num': '32',
+            'pool_type': 'replicated',
+            'compression_algorithm': 'zstd',
+            'compression_mode': 'aggressive',
+            'compression_max_blob_size': '10000000',
+            'compression_required_ratio': '0.8',
+            'application_metadata': ['rbd'],
+            'configuration': {
+                'rbd_qos_bps_limit': 2048,
+                'rbd_qos_iops_limit': None,
+            },
+        }
+        with self.__yield_pool(None, pool):
+            expected_configuration = [{
+                'name': 'rbd_qos_bps_limit',
+                'source': 1,
+                'value': '2048',
+            }, {
+                'name': 'rbd_qos_iops_limit',
+                'source': 0,
+                'value': '0',
+            }]
+            new_pool = self._get_pool(pool['pool'])
+            for conf in expected_configuration:
+                self.assertIn(conf, new_pool['configuration'])
+
+    def test_pool_create_with_quotas(self):
+        pools = [
+            {
+                'pool_data': {
+                    'pool': 'dashboard_pool_quota1',
+                    'pg_num': '32',
+                    'pool_type': 'replicated',
+                },
+                'pool_quotas_to_check': {
+                    'quota_max_objects': 0,
+                    'quota_max_bytes': 0,
+                }
+            },
+            {
+                'pool_data': {
+                    'pool': 'dashboard_pool_quota2',
+                    'pg_num': '32',
+                    'pool_type': 'replicated',
+                    'quota_max_objects': 1024,
+                    'quota_max_bytes': 1000,
+                },
+                'pool_quotas_to_check': {
+                    'quota_max_objects': 1024,
+                    'quota_max_bytes': 1000,
+                }
+            }
+        ]
+
+        for pool in pools:
+            pool_name = pool['pool_data']['pool']
+            with self.__yield_pool(pool_name, pool['pool_data']):
+                self._validate_pool_properties(pool['pool_quotas_to_check'],
+                                               self._get_pool(pool_name))
+
+    def test_pool_update_name(self):
+        name = 'pool_update'
+        updated_name = 'pool_updated_name'
+        with self.__yield_pool(name, None, updated_name):
+            props = {'pool': updated_name}
+            self._task_put('/api/pool/{}'.format(name), props)
+            time.sleep(5)
+            self.assertStatus(200)
+            self._validate_pool_properties(props, self._get_pool(updated_name))
+
+    def test_pool_update_metadata(self):
+        pool_name = 'pool_update_metadata'
+        with self.__yield_pool(pool_name):
+            props = {'application_metadata': ['rbd', 'sth']}
+            self._task_put('/api/pool/{}'.format(pool_name), props)
+            self._validate_pool_properties(props, self._get_pool(pool_name),
+                                           self.TIMEOUT_HEALTH_CLEAR * 2)
+
+            properties = {'application_metadata': ['rgw']}
+            self._task_put('/api/pool/' + pool_name, properties)
+            self._validate_pool_properties(properties, self._get_pool(pool_name),
+                                           self.TIMEOUT_HEALTH_CLEAR * 2)
+
+            properties = {'application_metadata': ['rbd', 'sth']}
+            self._task_put('/api/pool/' + pool_name, properties)
+            self._validate_pool_properties(properties, self._get_pool(pool_name),
+                                           self.TIMEOUT_HEALTH_CLEAR * 2)
+
+            properties = {'application_metadata': ['rgw']}
+            self._task_put('/api/pool/' + pool_name, properties)
+            self._validate_pool_properties(properties, self._get_pool(pool_name),
+                                           self.TIMEOUT_HEALTH_CLEAR * 2)
+
+    def test_pool_update_configuration(self):
+        pool_name = 'pool_update_configuration'
+        with self.__yield_pool(pool_name):
+            configuration = {
+                'rbd_qos_bps_limit': 1024,
+                'rbd_qos_iops_limit': None,
+            }
+            expected_configuration = [{
+                'name': 'rbd_qos_bps_limit',
+                'source': 1,
+                'value': '1024',
+            }, {
+                'name': 'rbd_qos_iops_limit',
+                'source': 0,
+                'value': '0',
+            }]
+            self._task_put('/api/pool/' + pool_name, {'configuration': configuration})
+            time.sleep(5)
+            pool_config = self._get_pool(pool_name)['configuration']
+            for conf in expected_configuration:
+                self.assertIn(conf, pool_config)
+
+    def test_pool_update_compression(self):
+        pool_name = 'pool_update_compression'
+        with self.__yield_pool(pool_name):
+            properties = {
+                'compression_algorithm': 'zstd',
+                'compression_mode': 'aggressive',
+                'compression_max_blob_size': '10000000',
+                'compression_required_ratio': '0.8',
+            }
+            self._task_put('/api/pool/' + pool_name, properties)
+            time.sleep(5)
+            self._validate_pool_properties(properties, self._get_pool(pool_name))
+
+    def test_pool_update_unset_compression(self):
+        pool_name = 'pool_update_unset_compression'
+        with self.__yield_pool(pool_name):
+            self._task_put('/api/pool/' + pool_name, {'compression_mode': 'unset'})
+            time.sleep(5)
+            self._validate_pool_properties({
+                'compression_algorithm': None,
+                'compression_mode': None,
+                'compression_max_blob_size': None,
+                'compression_required_ratio': None,
+            }, self._get_pool(pool_name))
+
+    def test_pool_update_quotas(self):
+        pool_name = 'pool_update_quotas'
+        with self.__yield_pool(pool_name):
+            properties = {
+                'quota_max_objects': 1024,
+                'quota_max_bytes': 1000,
+            }
+            self._task_put('/api/pool/' + pool_name, properties)
+            time.sleep(5)
+            self._validate_pool_properties(properties, self._get_pool(pool_name))
+
+    def test_pool_create_fail(self):
+        data = {'pool_type': u'replicated', 'rule_name': u'dnf', 'pg_num': u'8', 'pool': u'sadfs'}
+        self._task_post('/api/pool/', data)
+        self.assertStatus(400)
+        self.assertJsonBody({
+            'component': 'pool',
+            'code': "2",
+            'detail': "[errno -2] specified rule dnf doesn't exist"
+        })
+
+    def test_pool_info(self):
+        self._get("/ui-api/pool/info")
+        self.assertSchemaBody(JObj({
+            'pool_names': JList(str),
+            'compression_algorithms': JList(str),
+            'compression_modes': JList(str),
+            'is_all_bluestore': bool,
+            'bluestore_compression_algorithm': str,
+            'osd_count': int,
+            'crush_rules_replicated': JList(JObj({}, allow_unknown=True)),
+            'crush_rules_erasure': JList(JObj({}, allow_unknown=True)),
+            'pg_autoscale_default_mode': str,
+            'pg_autoscale_modes': JList(str),
+            'erasure_code_profiles': JList(JObj({}, allow_unknown=True)),
+            'used_rules': JObj({}, allow_unknown=True),
+            'used_profiles': JObj({}, allow_unknown=True),
+            'nodes': JList(JObj({}, allow_unknown=True)),
+        }))
diff --git a/qa/tasks/mgr/dashboard/test_rbd.py b/qa/tasks/mgr/dashboard/test_rbd.py
new file mode 100644
index 000000000..997d10f2a
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_rbd.py
@@ -0,0 +1,908 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=too-many-public-methods
+
+from __future__ import absolute_import
+
+import time
+
+from .helper import DashboardTestCase, JLeaf, JList, JObj
+
+
+class RbdTest(DashboardTestCase):
+    AUTH_ROLES = ['pool-manager', 'block-manager', 'cluster-manager']
+    LIST_VERSION = '2.0'
+
+    @DashboardTestCase.RunAs('test', 'test', [{'rbd-image': ['create', 'update', 'delete']}])
+    def test_read_access_permissions(self):
+        self._get('/api/block/image?offset=0&limit=-1&search=&sort=+name',
+                  version=RbdTest.LIST_VERSION)
+        self.assertStatus(403)
+        self.get_image('pool', None, 'image')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'rbd-image': ['read', 'update', 'delete']}])
+    def test_create_access_permissions(self):
+        self.create_image('pool', None, 'name', 0)
+        self.assertStatus(403)
+        self.create_snapshot('pool', None, 'image', 'snapshot')
+        self.assertStatus(403)
+        self.copy_image('src_pool', None, 'src_image', 'dest_pool', None, 'dest_image')
+        self.assertStatus(403)
+        self.clone_image('parent_pool', None, 'parent_image', 'parent_snap', 'pool', None, 'name')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'rbd-image': ['read', 'create', 'delete']}])
+    def test_update_access_permissions(self):
+        self.edit_image('pool', None, 'image')
+        self.assertStatus(403)
+        self.update_snapshot('pool', None, 'image', 'snapshot', None, None)
+        self.assertStatus(403)
+        self.rollback_snapshot('rbd', None, 'rollback_img', 'snap1')
+        self.assertStatus(403)
+        self.flatten_image('pool', None, 'image')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'rbd-image': ['read', 'create', 'update']}])
+    def test_delete_access_permissions(self):
+        self.remove_image('pool', None, 'image')
+        self.assertStatus(403)
+        self.remove_snapshot('pool', None, 'image', 'snapshot')
+        self.assertStatus(403)
+
+    @classmethod
+    def create_namespace(cls, pool, namespace):
+        data = {'namespace': namespace}
+        return cls._post('/api/block/pool/{}/namespace'.format(pool), data)
+
+    @classmethod
+    def remove_namespace(cls, pool, namespace):
+        return cls._delete('/api/block/pool/{}/namespace/{}'.format(pool, namespace))
+
+    @classmethod
+    def create_image(cls, pool, namespace, name, size, **kwargs):
+        data = {'name': name, 'pool_name': pool, 'namespace': namespace, 'size': size}
+        data.update(kwargs)
+        return cls._task_post('/api/block/image', data)
+
+    @classmethod
+    def get_image(cls, pool, namespace, name):
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._get('/api/block/image/{}%2F{}{}'.format(pool, namespace, name))
+
+    @classmethod
+    def clone_image(cls, parent_pool, parent_namespace, parent_image, parent_snap, pool, namespace,
+                    name, **kwargs):
+        # pylint: disable=too-many-arguments
+        data = {'child_image_name': name, 'child_namespace': namespace, 'child_pool_name': pool}
+        data.update(kwargs)
+        parent_namespace = '{}%2F'.format(parent_namespace) if parent_namespace else ''
+        return cls._task_post('/api/block/image/{}%2F{}{}/snap/{}/clone'
+                              .format(parent_pool, parent_namespace, parent_image, parent_snap),
+                              data)
+
+    @classmethod
+    def copy_image(cls, src_pool, src_namespace, src_image, dest_pool, dest_namespace, dest_image,
+                   **kwargs):
+        # pylint: disable=too-many-arguments
+        data = {'dest_image_name': dest_image,
+                'dest_pool_name': dest_pool,
+                'dest_namespace': dest_namespace}
+        data.update(kwargs)
+        src_namespace = '{}%2F'.format(src_namespace) if src_namespace else ''
+        return cls._task_post('/api/block/image/{}%2F{}{}/copy'
+                              .format(src_pool, src_namespace, src_image), data)
+
+    @classmethod
+    def remove_image(cls, pool, namespace, image):
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._task_delete('/api/block/image/{}%2F{}{}'.format(pool, namespace, image))
+
+    # pylint: disable=too-many-arguments
+    @classmethod
+    def edit_image(cls, pool, namespace, image, name=None, size=None, features=None, **kwargs):
+        kwargs.update({'name': name, 'size': size, 'features': features})
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._task_put('/api/block/image/{}%2F{}{}'.format(pool, namespace, image), kwargs)
+
+    @classmethod
+    def flatten_image(cls, pool, namespace, image):
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._task_post('/api/block/image/{}%2F{}{}/flatten'.format(pool, namespace, image))
+
+    @classmethod
+    def create_snapshot(cls, pool, namespace, image, snapshot):
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._task_post('/api/block/image/{}%2F{}{}/snap'.format(pool, namespace, image),
+                              {'snapshot_name': snapshot})
+
+    @classmethod
+    def remove_snapshot(cls, pool, namespace, image, snapshot):
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._task_delete('/api/block/image/{}%2F{}{}/snap/{}'.format(pool, namespace, image,
+                                                                            snapshot))
+
+    @classmethod
+    def update_snapshot(cls, pool, namespace, image, snapshot, new_name, is_protected):
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._task_put('/api/block/image/{}%2F{}{}/snap/{}'.format(pool, namespace, image,
+                                                                         snapshot),
+                             {'new_snap_name': new_name, 'is_protected': is_protected})
+
+    @classmethod
+    def rollback_snapshot(cls, pool, namespace, image, snapshot):
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._task_post('/api/block/image/{}%2F{}{}/snap/{}/rollback'.format(pool,
+                                                                                   namespace,
+                                                                                   image,
+                                                                                   snapshot))
+
+    @classmethod
+    def setUpClass(cls):
+        super(RbdTest, cls).setUpClass()
+        cls.create_pool('rbd', 2**3, 'replicated')
+        cls.create_pool('rbd_iscsi', 2**3, 'replicated')
+
+        cls.create_image('rbd', None, 'img1', 2**30)
+        cls.create_image('rbd', None, 'img2', 2*2**30)
+        cls.create_image('rbd_iscsi', None, 'img1', 2**30)
+        cls.create_image('rbd_iscsi', None, 'img2', 2*2**30)
+
+        osd_metadata = cls.ceph_cluster.mon_manager.get_osd_metadata()
+        cls.bluestore_support = True
+        for osd in osd_metadata:
+            if osd['osd_objectstore'] != 'bluestore':
+                cls.bluestore_support = False
+                break
+
+    @classmethod
+    def tearDownClass(cls):
+        super(RbdTest, cls).tearDownClass()
+        cls._ceph_cmd(['osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it'])
+        cls._ceph_cmd(['osd', 'pool', 'delete', 'rbd_iscsi', 'rbd_iscsi',
+                       '--yes-i-really-really-mean-it'])
+        cls._ceph_cmd(['osd', 'pool', 'delete', 'rbd_data', 'rbd_data',
+                       '--yes-i-really-really-mean-it'])
+
+    def create_image_in_trash(self, pool, name, delay=0):
+        self.create_image(pool, None, name, 10240)
+        img = self._get('/api/block/image/{}%2F{}'.format(pool, name))
+
+        self._task_post("/api/block/image/{}%2F{}/move_trash".format(pool, name),
+                        {'delay': delay})
+        self.assertStatus([200, 201])
+        return img['id']
+
+    @classmethod
+    def remove_trash(cls, pool, image_id, force=False):
+        return cls._task_delete('/api/block/image/trash/{}%2F{}/?force={}'.format(
+            pool, image_id, force))
+
+    @classmethod
+    def restore_trash(cls, pool, namespace, image_id, new_image_name):
+        data = {'new_image_name': new_image_name}
+        namespace = '{}%2F'.format(namespace) if namespace else ''
+        return cls._task_post('/api/block/image/trash/{}%2F{}{}/restore'.format(pool,
+                                                                                namespace,
+                                                                                image_id), data)
+
+    @classmethod
+    def purge_trash(cls, pool):
+        return cls._task_post('/api/block/image/trash/purge?pool_name={}'.format(pool))
+
+    @classmethod
+    def get_trash(cls, pool, image_id):
+        trash = cls._get('/api/block/image/trash/?pool_name={}'.format(pool))
+        if isinstance(trash, list):
+            for trash_pool in trash:
+                for image in trash_pool['value']:
+                    if image['id'] == image_id:
+                        return image
+
+        return None
+
+    def _validate_image(self, img, **kwargs):
+        """
+        Example of an RBD image json:
+
+        {
+            "size": 1073741824,
+            "obj_size": 4194304,
+            "mirror_mode": "journal",
+            "num_objs": 256,
+            "order": 22,
+            "block_name_prefix": "rbd_data.10ae2ae8944a",
+            "name": "img1",
+            "pool_name": "rbd",
+            "features": 61,
+            "features_name": ["deep-flatten", "exclusive-lock", "fast-diff", "layering",
+                              "object-map"]
+        }
+        """
+        schema = JObj(sub_elems={
+            'size': JLeaf(int),
+            'obj_size': JLeaf(int),
+            'num_objs': JLeaf(int),
+            'order': JLeaf(int),
+            'block_name_prefix': JLeaf(str),
+            'name': JLeaf(str),
+            'id': JLeaf(str),
+            'unique_id': JLeaf(str),
+            'image_format': JLeaf(int),
+            'pool_name': JLeaf(str),
+            'namespace': JLeaf(str, none=True),
+            'features': JLeaf(int),
+            'features_name': JList(JLeaf(str)),
+            'stripe_count': JLeaf(int, none=True),
+            'stripe_unit': JLeaf(int, none=True),
+            'parent': JObj(sub_elems={'pool_name': JLeaf(str),
+                                      'pool_namespace': JLeaf(str, none=True),
+                                      'image_name': JLeaf(str),
+                                      'snap_name': JLeaf(str)}, none=True),
+            'data_pool': JLeaf(str, none=True),
+            'snapshots': JList(JLeaf(dict)),
+            'timestamp': JLeaf(str, none=True),
+            'disk_usage': JLeaf(int, none=True),
+            'total_disk_usage': JLeaf(int, none=True),
+            'configuration': JList(JObj(sub_elems={
+                'name': JLeaf(str),
+                'source': JLeaf(int),
+                'value': JLeaf(str),
+            })),
+            'mirror_mode': JLeaf(str),
+        })
+        self.assertSchema(img, schema)
+
+        for k, v in kwargs.items():
+            if isinstance(v, list):
+                self.assertSetEqual(set(img[k]), set(v))
+            else:
+                self.assertEqual(img[k], v)
+
+    def _validate_snapshot(self, snap, **kwargs):
+        self.assertIn('id', snap)
+        self.assertIn('name', snap)
+        self.assertIn('is_protected', snap)
+        self.assertIn('timestamp', snap)
+        self.assertIn('size', snap)
+        self.assertIn('children', snap)
+
+        for k, v in kwargs.items():
+            if isinstance(v, list):
+                self.assertSetEqual(set(snap[k]), set(v))
+            else:
+                self.assertEqual(snap[k], v)
+
+    def _validate_snapshot_list(self, snap_list, snap_name=None, **kwargs):
+        found = False
+        for snap in snap_list:
+            self.assertIn('name', snap)
+            if snap_name and snap['name'] == snap_name:
+                found = True
+                self._validate_snapshot(snap, **kwargs)
+                break
+        if snap_name and not found:
+            self.fail("Snapshot {} not found".format(snap_name))
+
+    def test_list(self):
+        data = self._get('/api/block/image?offset=0&limit=-1&search=&sort=+name',
+                         version=RbdTest.LIST_VERSION)
+        self.assertStatus(200)
+        self.assertEqual(len(data), 2)
+
+        for pool_view in data:
+            self.assertIsNotNone(pool_view['value'])
+            self.assertIn('pool_name', pool_view)
+            self.assertIn(pool_view['pool_name'], ['rbd', 'rbd_iscsi'])
+            image_list = pool_view['value']
+            self.assertEqual(len(image_list), 2)
+
+            for img in image_list:
+                self.assertIn('name', img)
+                self.assertIn('pool_name', img)
+                self.assertIn(img['pool_name'], ['rbd', 'rbd_iscsi'])
+                if img['name'] == 'img1':
+                    self._validate_image(img, size=1073741824,
+                                         num_objs=256, obj_size=4194304,
+                                         features_name=['deep-flatten',
+                                                        'exclusive-lock',
+                                                        'fast-diff',
+                                                        'layering',
+                                                        'object-map'])
+                elif img['name'] == 'img2':
+                    self._validate_image(img, size=2147483648,
+                                         num_objs=512, obj_size=4194304,
+                                         features_name=['deep-flatten',
+                                                        'exclusive-lock',
+                                                        'fast-diff',
+                                                        'layering',
+                                                        'object-map'])
+                else:
+                    assert False, "Unexcepted image '{}' in result list".format(img['name'])
+
+    def test_create(self):
+        rbd_name = 'test_rbd'
+        self.create_image('rbd', None, rbd_name, 10240)
+        self.assertStatus(201)
+
+        img = self.get_image('rbd', None, 'test_rbd')
+        self.assertStatus(200)
+
+        self._validate_image(img, name=rbd_name, size=10240,
+                             num_objs=1, obj_size=4194304,
+                             features_name=['deep-flatten',
+                                            'exclusive-lock',
+                                            'fast-diff', 'layering',
+                                            'object-map'])
+
+        self.remove_image('rbd', None, rbd_name)
+
+    def test_create_with_configuration(self):
+        pool = 'rbd'
+        image_name = 'image_with_config'
+        size = 10240
+        configuration = {
+            'rbd_qos_bps_limit': 10240,
+            'rbd_qos_bps_burst': 10240 * 2,
+        }
+        expected = [{
+            'name': 'rbd_qos_bps_limit',
+            'source': 2,
+            'value': str(10240),
+        }, {
+            'name': 'rbd_qos_bps_burst',
+            'source': 2,
+            'value': str(10240 * 2),
+        }]
+
+        self.create_image(pool, None, image_name, size, configuration=configuration)
+        self.assertStatus(201)
+        img = self.get_image('rbd', None, image_name)
+        self.assertStatus(200)
+        for conf in expected:
+            self.assertIn(conf, img['configuration'])
+
+        self.remove_image(pool, None, image_name)
+
+    def test_create_rbd_in_data_pool(self):
+        if not self.bluestore_support:
+            self.skipTest('requires bluestore cluster')
+
+        self.create_pool('data_pool', 2**4, 'erasure')
+
+        rbd_name = 'test_rbd_in_data_pool'
+        self.create_image('rbd', None, rbd_name, 10240, data_pool='data_pool')
+        self.assertStatus(201)
+
+        img = self.get_image('rbd', None, 'test_rbd_in_data_pool')
+        self.assertStatus(200)
+
+        self._validate_image(img, name=rbd_name, size=10240,
+                             num_objs=1, obj_size=4194304,
+                             data_pool='data_pool',
+                             features_name=['data-pool', 'deep-flatten',
+                                            'exclusive-lock',
+                                            'fast-diff', 'layering',
+                                            'object-map'])
+
+        self.remove_image('rbd', None, rbd_name)
+        self.assertStatus(204)
+        self._ceph_cmd(['osd', 'pool', 'delete', 'data_pool', 'data_pool',
+                        '--yes-i-really-really-mean-it'])
+
+    def test_create_rbd_twice(self):
+        res = self.create_image('rbd', None, 'test_rbd_twice', 10240)
+
+        res = self.create_image('rbd', None, 'test_rbd_twice', 10240)
+        self.assertStatus(400)
+        self.assertEqual(res, {"code": '17', 'status': 400, "component": "rbd",
+                               "detail": "[errno 17] RBD image already exists (error creating "
+                                         "image)",
+                               'task': {'name': 'rbd/create',
+                                        'metadata': {'pool_name': 'rbd', 'namespace': None,
+                                                     'image_name': 'test_rbd_twice'}}})
+        self.remove_image('rbd', None, 'test_rbd_twice')
+        self.assertStatus(204)
+
+    def test_snapshots_and_clone_info(self):
+        self.create_snapshot('rbd', None, 'img1', 'snap1')
+        self.create_snapshot('rbd', None, 'img1', 'snap2')
+        self._rbd_cmd(['snap', 'protect', 'rbd/img1@snap1'])
+        self._rbd_cmd(['clone', 'rbd/img1@snap1', 'rbd_iscsi/img1_clone'])
+
+        img = self.get_image('rbd', None, 'img1')
+        self.assertStatus(200)
+        self._validate_image(img, name='img1', size=1073741824,
+                             num_objs=256, obj_size=4194304, parent=None,
+                             features_name=['deep-flatten', 'exclusive-lock',
+                                            'fast-diff', 'layering',
+                                            'object-map'])
+        for snap in img['snapshots']:
+            if snap['name'] == 'snap1':
+                self._validate_snapshot(snap, is_protected=True)
+                self.assertEqual(len(snap['children']), 1)
+                self.assertDictEqual(snap['children'][0],
+                                     {'pool_name': 'rbd_iscsi',
+                                      'image_name': 'img1_clone'})
+            elif snap['name'] == 'snap2':
+                self._validate_snapshot(snap, is_protected=False)
+
+        img = self.get_image('rbd_iscsi', None, 'img1_clone')
+        self.assertStatus(200)
+        self._validate_image(img, name='img1_clone', size=1073741824,
+                             num_objs=256, obj_size=4194304,
+                             parent={'pool_name': 'rbd', 'pool_namespace': '',
+                                     'image_name': 'img1', 'snap_name': 'snap1'},
+                             features_name=['deep-flatten', 'exclusive-lock',
+                                            'fast-diff', 'layering',
+                                            'object-map'])
+        self.remove_image('rbd_iscsi', None, 'img1_clone')
+        self.assertStatus(204)
+
+    def test_disk_usage(self):
+        self._rbd_cmd(['bench', '--io-type', 'write', '--io-total', '50M', 'rbd/img2'])
+        self.create_snapshot('rbd', None, 'img2', 'snap1')
+        self._rbd_cmd(['bench', '--io-type', 'write', '--io-total', '20M', 'rbd/img2'])
+        self.create_snapshot('rbd', None, 'img2', 'snap2')
+        self._rbd_cmd(['bench', '--io-type', 'write', '--io-total', '10M', 'rbd/img2'])
+        self.create_snapshot('rbd', None, 'img2', 'snap3')
+        self._rbd_cmd(['bench', '--io-type', 'write', '--io-total', '5M', 'rbd/img2'])
+        img = self.get_image('rbd', None, 'img2')
+        self.assertStatus(200)
+        self._validate_image(img, name='img2', size=2147483648,
+                             total_disk_usage=268435456, disk_usage=67108864)
+
+    def test_delete_non_existent_image(self):
+        res = self.remove_image('rbd', None, 'i_dont_exist')
+        self.assertStatus(404)
+        self.assertEqual(res, {u'code': 404, "status": 404, "component": None,
+                               "detail": "(404, 'Image not found')",
+                               'task': {'name': 'rbd/delete',
+                                        'metadata': {'image_spec': 'rbd/i_dont_exist'}}})
+
+    def test_image_delete(self):
+        self.create_image('rbd', None, 'delete_me', 2**30)
+        self.assertStatus(201)
+        self.create_snapshot('rbd', None, 'delete_me', 'snap1')
+        self.assertStatus(201)
+        self.create_snapshot('rbd', None, 'delete_me', 'snap2')
+        self.assertStatus(201)
+
+        img = self.get_image('rbd', None, 'delete_me')
+        self.assertStatus(200)
+        self._validate_image(img, name='delete_me', size=2**30)
+        self.assertEqual(len(img['snapshots']), 2)
+
+        self.remove_snapshot('rbd', None, 'delete_me', 'snap1')
+        self.assertStatus(204)
+        self.remove_snapshot('rbd', None, 'delete_me', 'snap2')
+        self.assertStatus(204)
+
+        img = self.get_image('rbd', None, 'delete_me')
+        self.assertStatus(200)
+        self._validate_image(img, name='delete_me', size=2**30)
+        self.assertEqual(len(img['snapshots']), 0)
+
+        self.remove_image('rbd', None, 'delete_me')
+        self.assertStatus(204)
+
+    def test_image_delete_with_snapshot(self):
+        self.create_image('rbd', None, 'delete_me', 2**30)
+        self.assertStatus(201)
+        self.create_snapshot('rbd', None, 'delete_me', 'snap1')
+        self.assertStatus(201)
+        self.create_snapshot('rbd', None, 'delete_me', 'snap2')
+        self.assertStatus(201)
+
+        img = self.get_image('rbd', None, 'delete_me')
+        self.assertStatus(200)
+        self._validate_image(img, name='delete_me', size=2**30)
+        self.assertEqual(len(img['snapshots']), 2)
+
+        self.remove_image('rbd', None, 'delete_me')
+        self.assertStatus(204)
+
+    def test_image_rename(self):
+        self.create_image('rbd', None, 'edit_img', 2**30)
+        self.assertStatus(201)
+        self.get_image('rbd', None, 'edit_img')
+        self.assertStatus(200)
+        self.edit_image('rbd', None, 'edit_img', 'new_edit_img')
+        self.assertStatus(200)
+        self.get_image('rbd', None, 'edit_img')
+        self.assertStatus(404)
+        self.get_image('rbd', None, 'new_edit_img')
+        self.assertStatus(200)
+        self.remove_image('rbd', None, 'new_edit_img')
+        self.assertStatus(204)
+
+    def test_image_resize(self):
+        self.create_image('rbd', None, 'edit_img', 2**30)
+        self.assertStatus(201)
+        img = self.get_image('rbd', None, 'edit_img')
+        self.assertStatus(200)
+        self._validate_image(img, size=2**30)
+        self.edit_image('rbd', None, 'edit_img', size=2*2**30)
+        self.assertStatus(200)
+        img = self.get_image('rbd', None, 'edit_img')
+        self.assertStatus(200)
+        self._validate_image(img, size=2*2**30)
+        self.remove_image('rbd', None, 'edit_img')
+        self.assertStatus(204)
+
+    def test_image_change_features(self):
+        self.create_image('rbd', None, 'edit_img', 2**30, features=["layering"])
+        self.assertStatus(201)
+        img = self.get_image('rbd', None, 'edit_img')
+        self.assertStatus(200)
+        self._validate_image(img, features_name=["layering"])
+        self.edit_image('rbd', None, 'edit_img',
+                        features=["fast-diff", "object-map", "exclusive-lock"])
+        self.assertStatus(200)
+        img = self.get_image('rbd', None, 'edit_img')
+        self.assertStatus(200)
+        self._validate_image(img, features_name=['exclusive-lock',
+                                                 'fast-diff', 'layering',
+                                                 'object-map'])
+        self.edit_image('rbd', None, 'edit_img',
+                        features=["journaling", "exclusive-lock"])
+        self.assertStatus(200)
+        img = self.get_image('rbd', None, 'edit_img')
+        self.assertStatus(200)
+        self._validate_image(img, features_name=['exclusive-lock',
+                                                 'journaling', 'layering'])
+        self.remove_image('rbd', None, 'edit_img')
+        self.assertStatus(204)
+
+    def test_image_change_config(self):
+        pool = 'rbd'
+        image = 'image_with_config'
+        initial_conf = {
+            'rbd_qos_bps_limit': 10240,
+            'rbd_qos_write_iops_limit': None
+        }
+        initial_expect = [{
+            'name': 'rbd_qos_bps_limit',
+            'source': 2,
+            'value': '10240',
+        }, {
+            'name': 'rbd_qos_write_iops_limit',
+            'source': 0,
+            'value': '0',
+        }]
+        new_conf = {
+            'rbd_qos_bps_limit': 0,
+            'rbd_qos_bps_burst': 20480,
+            'rbd_qos_write_iops_limit': None
+        }
+        new_expect = [{
+            'name': 'rbd_qos_bps_limit',
+            'source': 2,
+            'value': '0',
+        }, {
+            'name': 'rbd_qos_bps_burst',
+            'source': 2,
+            'value': '20480',
+        }, {
+            'name': 'rbd_qos_write_iops_limit',
+            'source': 0,
+            'value': '0',
+        }]
+
+        self.create_image(pool, None, image, 2**30, configuration=initial_conf)
+        self.assertStatus(201)
+        img = self.get_image(pool, None, image)
+        self.assertStatus(200)
+        for conf in initial_expect:
+            self.assertIn(conf, img['configuration'])
+
+        self.edit_image(pool, None, image, configuration=new_conf)
+        img = self.get_image(pool, None, image)
+        self.assertStatus(200)
+        for conf in new_expect:
+            self.assertIn(conf, img['configuration'])
+
+        self.remove_image(pool, None, image)
+        self.assertStatus(204)
+
+    def test_update_snapshot(self):
+        self.create_snapshot('rbd', None, 'img1', 'snap5')
+        self.assertStatus(201)
+        img = self.get_image('rbd', None, 'img1')
+        self._validate_snapshot_list(img['snapshots'], 'snap5', is_protected=False)
+
+        self.update_snapshot('rbd', None, 'img1', 'snap5', 'snap6', None)
+        self.assertStatus(200)
+        img = self.get_image('rbd', None, 'img1')
+        self._validate_snapshot_list(img['snapshots'], 'snap6', is_protected=False)
+
+        self.update_snapshot('rbd', None, 'img1', 'snap6', None, True)
+        self.assertStatus(200)
+        img = self.get_image('rbd', None, 'img1')
+        self._validate_snapshot_list(img['snapshots'], 'snap6', is_protected=True)
+
+        self.update_snapshot('rbd', None, 'img1', 'snap6', 'snap5', False)
+        self.assertStatus(200)
+        img = self.get_image('rbd', None, 'img1')
+        self._validate_snapshot_list(img['snapshots'], 'snap5', is_protected=False)
+
+        self.remove_snapshot('rbd', None, 'img1', 'snap5')
+        self.assertStatus(204)
+
+    def test_snapshot_rollback(self):
+        self.create_image('rbd', None, 'rollback_img', 2**30,
+                          features=["layering", "exclusive-lock", "fast-diff",
+                                    "object-map"])
+        self.assertStatus(201)
+        self.create_snapshot('rbd', None, 'rollback_img', 'snap1')
+        self.assertStatus(201)
+
+        img = self.get_image('rbd', None, 'rollback_img')
+        self.assertStatus(200)
+        self.assertEqual(img['disk_usage'], 0)
+
+        self._rbd_cmd(['bench', '--io-type', 'write', '--io-total', '5M',
+                       'rbd/rollback_img'])
+
+        img = self.get_image('rbd', None, 'rollback_img')
+        self.assertStatus(200)
+        self.assertGreater(img['disk_usage'], 0)
+
+        self.rollback_snapshot('rbd', None, 'rollback_img', 'snap1')
+        self.assertStatus([201, 200])
+
+        img = self.get_image('rbd', None, 'rollback_img')
+        self.assertStatus(200)
+        self.assertEqual(img['disk_usage'], 0)
+
+        self.remove_snapshot('rbd', None, 'rollback_img', 'snap1')
+        self.assertStatus(204)
+        self.remove_image('rbd', None, 'rollback_img')
+        self.assertStatus(204)
+
+    def test_clone(self):
+        self.create_image('rbd', None, 'cimg', 2**30, features=["layering"])
+        self.assertStatus(201)
+        self.create_snapshot('rbd', None, 'cimg', 'snap1')
+        self.assertStatus(201)
+        self.update_snapshot('rbd', None, 'cimg', 'snap1', None, True)
+        self.assertStatus(200)
+        self.clone_image('rbd', None, 'cimg', 'snap1', 'rbd', None, 'cimg-clone',
+                         features=["layering", "exclusive-lock", "fast-diff",
+                                   "object-map"])
+        self.assertStatus([200, 201])
+
+        img = self.get_image('rbd', None, 'cimg-clone')
+        self.assertStatus(200)
+        self._validate_image(img, features_name=['exclusive-lock',
+                                                 'fast-diff', 'layering',
+                                                 'object-map'],
+                             parent={'pool_name': 'rbd', 'pool_namespace': '',
+                                     'image_name': 'cimg', 'snap_name': 'snap1'})
+
+        res = self.remove_image('rbd', None, 'cimg')
+        self.assertStatus(400)
+        self.assertIn('code', res)
+        self.assertEqual(res['code'], '16')
+
+        self.remove_image('rbd', None, 'cimg-clone')
+        self.assertStatus(204)
+        self.remove_image('rbd', None, 'cimg')
+        self.assertStatus(204)
+
+    def test_copy(self):
+        self.create_image('rbd', None, 'coimg', 2**30,
+                          features=["layering", "exclusive-lock", "fast-diff",
+                                    "object-map"])
+        self.assertStatus(201)
+
+        self._rbd_cmd(['bench', '--io-type', 'write', '--io-total', '5M',
+                       'rbd/coimg'])
+
+        self.copy_image('rbd', None, 'coimg', 'rbd_iscsi', None, 'coimg-copy',
+                        features=["layering", "fast-diff", "exclusive-lock",
+                                  "object-map"])
+        self.assertStatus([200, 201])
+
+        img = self.get_image('rbd', None, 'coimg')
+        self.assertStatus(200)
+        self._validate_image(img, features_name=['layering', 'exclusive-lock',
+                                                 'fast-diff', 'object-map'])
+
+        img_copy = self.get_image('rbd_iscsi', None, 'coimg-copy')
+        self._validate_image(img_copy, features_name=['exclusive-lock',
+                                                      'fast-diff', 'layering',
+                                                      'object-map'],
+                             disk_usage=img['disk_usage'])
+
+        self.remove_image('rbd', None, 'coimg')
+        self.assertStatus(204)
+        self.remove_image('rbd_iscsi', None, 'coimg-copy')
+        self.assertStatus(204)
+
+    def test_flatten(self):
+        self.create_snapshot('rbd', None, 'img1', 'snapf')
+        self.update_snapshot('rbd', None, 'img1', 'snapf', None, True)
+        self.clone_image('rbd', None, 'img1', 'snapf', 'rbd_iscsi', None, 'img1_snapf_clone')
+
+        img = self.get_image('rbd_iscsi', None, 'img1_snapf_clone')
+        self.assertStatus(200)
+        self.assertIsNotNone(img['parent'])
+
+        self.flatten_image('rbd_iscsi', None, 'img1_snapf_clone')
+        self.assertStatus([200, 201])
+
+        img = self.get_image('rbd_iscsi', None, 'img1_snapf_clone')
+        self.assertStatus(200)
+        self.assertIsNone(img['parent'])
+
+        self.update_snapshot('rbd', None, 'img1', 'snapf', None, False)
+        self.remove_snapshot('rbd', None, 'img1', 'snapf')
+        self.assertStatus(204)
+
+        self.remove_image('rbd_iscsi', None, 'img1_snapf_clone')
+        self.assertStatus(204)
+
+    def test_default_features(self):
+        default_features = self._get('/api/block/image/default_features')
+        self.assertEqual(default_features, [
+            'deep-flatten', 'exclusive-lock', 'fast-diff', 'layering', 'object-map'])
+
+    def test_clone_format_version(self):
+        config_name = 'rbd_default_clone_format'
+
+        def _get_config_by_name(conf_name):
+            data = self._get('/api/cluster_conf/{}'.format(conf_name))
+            if 'value' in data:
+                return data['value']
+            return None
+
+        # with rbd_default_clone_format = auto
+        clone_format_version = self._get('/api/block/image/clone_format_version')
+        self.assertEqual(clone_format_version, 1)
+        self.assertStatus(200)
+
+        # with rbd_default_clone_format = 1
+        value = [{'section': "global", 'value': "1"}]
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': value
+        })
+        self.wait_until_equal(
+            lambda: _get_config_by_name(config_name),
+            value,
+            timeout=60)
+        clone_format_version = self._get('/api/block/image/clone_format_version')
+        self.assertEqual(clone_format_version, 1)
+        self.assertStatus(200)
+
+        # with rbd_default_clone_format = 2
+        value = [{'section': "global", 'value': "2"}]
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': value
+        })
+        self.wait_until_equal(
+            lambda: _get_config_by_name(config_name),
+            value,
+            timeout=60)
+        clone_format_version = self._get('/api/block/image/clone_format_version')
+        self.assertEqual(clone_format_version, 2)
+        self.assertStatus(200)
+
+        value = []
+        self._post('/api/cluster_conf', {
+            'name': config_name,
+            'value': value
+        })
+        self.wait_until_equal(
+            lambda: _get_config_by_name(config_name),
+            None,
+            timeout=60)
+
+    def test_image_with_namespace(self):
+        self.create_namespace('rbd', 'ns')
+        self.create_image('rbd', 'ns', 'test', 10240)
+        self.assertStatus(201)
+
+        img = self.get_image('rbd', 'ns', 'test')
+        self.assertStatus(200)
+
+        self._validate_image(img, name='test', size=10240,
+                             pool_name='rbd', namespace='ns',
+                             num_objs=1, obj_size=4194304,
+                             features_name=['deep-flatten',
+                                            'exclusive-lock',
+                                            'fast-diff', 'layering',
+                                            'object-map'])
+
+        self.remove_image('rbd', 'ns', 'test')
+        self.remove_namespace('rbd', 'ns')
+
+    def test_move_image_to_trash(self):
+        img_id = self.create_image_in_trash('rbd', 'test_rbd')
+
+        self.get_image('rbd', None, 'test_rbd')
+        self.assertStatus(404)
+
+        time.sleep(1)
+
+        image = self.get_trash('rbd', img_id)
+        self.assertIsNotNone(image)
+
+        self.remove_trash('rbd', img_id)
+
+    def test_list_trash(self):
+        img_id = self.create_image_in_trash('rbd', 'test_rbd', 0)
+        data = self._get('/api/block/image/trash/?pool_name={}'.format('rbd'))
+        self.assertStatus(200)
+        self.assertIsInstance(data, list)
+        self.assertIsNotNone(data)
+
+        self.remove_trash('rbd', img_id)
+        self.assertStatus(204)
+
+    def test_restore_trash(self):
+        img_id = self.create_image_in_trash('rbd', 'test_rbd')
+
+        self.restore_trash('rbd', None, img_id, 'test_rbd')
+
+        self.get_image('rbd', None, 'test_rbd')
+        self.assertStatus(200)
+
+        image = self.get_trash('rbd', img_id)
+        self.assertIsNone(image)
+
+        self.remove_image('rbd', None, 'test_rbd')
+
+    def test_remove_expired_trash(self):
+        img_id = self.create_image_in_trash('rbd', 'test_rbd', 0)
+        self.remove_trash('rbd', img_id, False)
+        self.assertStatus(204)
+
+        image = self.get_trash('rbd', img_id)
+        self.assertIsNone(image)
+
+    def test_remove_not_expired_trash(self):
+        img_id = self.create_image_in_trash('rbd', 'test_rbd', 9999)
+        self.remove_trash('rbd', img_id, False)
+        self.assertStatus(400)
+
+        time.sleep(1)
+
+        image = self.get_trash('rbd', img_id)
+        self.assertIsNotNone(image)
+
+        self.remove_trash('rbd', img_id, True)
+
+    def test_remove_not_expired_trash_with_force(self):
+        img_id = self.create_image_in_trash('rbd', 'test_rbd', 9999)
+        self.remove_trash('rbd', img_id, True)
+        self.assertStatus(204)
+
+        image = self.get_trash('rbd', img_id)
+        self.assertIsNone(image)
+
+    def test_purge_trash(self):
+        id_expired = self.create_image_in_trash('rbd', 'test_rbd_expired', 0)
+        id_not_expired = self.create_image_in_trash('rbd', 'test_rbd', 9999)
+
+        time.sleep(1)
+
+        self.purge_trash('rbd')
+        self.assertStatus([200, 201])
+
+        time.sleep(1)
+
+        trash_not_expired = self.get_trash('rbd', id_not_expired)
+        self.assertIsNotNone(trash_not_expired)
+
+        self.wait_until_equal(lambda: self.get_trash('rbd', id_expired), None, 60)
+
+    def test_list_namespaces(self):
+        self.create_namespace('rbd', 'ns')
+
+        namespaces = self._get('/api/block/pool/rbd/namespace')
+        self.assertStatus(200)
+        self.assertEqual(len(namespaces), 1)
+
+        self.remove_namespace('rbd', 'ns')
diff --git a/qa/tasks/mgr/dashboard/test_rbd_mirroring.py b/qa/tasks/mgr/dashboard/test_rbd_mirroring.py
new file mode 100644
index 000000000..b6a86e405
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_rbd_mirroring.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=too-many-public-methods
+
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase
+
+
+class RbdMirroringTest(DashboardTestCase):
+    AUTH_ROLES = ['pool-manager', 'block-manager']
+
+    @classmethod
+    def get_pool(cls, pool):
+        data = cls._get('/api/block/mirroring/pool/{}'.format(pool))
+        if isinstance(data, dict):
+            return data
+        return {}
+
+    @classmethod
+    def update_pool(cls, pool, mirror_mode):
+        data = {'mirror_mode': mirror_mode}
+        return cls._task_put('/api/block/mirroring/pool/{}'.format(pool),
+                             data)
+
+    @classmethod
+    def list_peers(cls, pool):
+        data = cls._get('/api/block/mirroring/pool/{}/peer'.format(pool))
+        if isinstance(data, list):
+            return data
+        return []
+
+    @classmethod
+    def get_peer(cls, pool, peer_uuid):
+        data = cls._get('/api/block/mirroring/pool/{}/peer/{}'.format(pool, peer_uuid))
+        if isinstance(data, dict):
+            return data
+        return {}
+
+    @classmethod
+    def create_peer(cls, pool, cluster_name, client_id, **kwargs):
+        data = {'cluster_name': cluster_name, 'client_id': client_id}
+        data.update(kwargs)
+        return cls._task_post('/api/block/mirroring/pool/{}/peer'.format(pool),
+                              data)
+
+    @classmethod
+    def update_peer(cls, pool, peer_uuid, **kwargs):
+        return cls._task_put('/api/block/mirroring/pool/{}/peer/{}'.format(pool, peer_uuid),
+                             kwargs)
+
+    @classmethod
+    def delete_peer(cls, pool, peer_uuid):
+        return cls._task_delete('/api/block/mirroring/pool/{}/peer/{}'.format(pool, peer_uuid))
+
+    @classmethod
+    def setUpClass(cls):
+        super(RbdMirroringTest, cls).setUpClass()
+        cls.create_pool('rbd', 2**3, 'replicated')
+
+    @classmethod
+    def tearDownClass(cls):
+        super(RbdMirroringTest, cls).tearDownClass()
+        cls._ceph_cmd(['osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it'])
+
+    @DashboardTestCase.RunAs('test', 'test', [{'rbd-mirroring': ['create', 'update', 'delete']}])
+    def test_read_access_permissions(self):
+        self.get_pool('rbd')
+        self.assertStatus(403)
+        self.list_peers('rbd')
+        self.assertStatus(403)
+        self.get_peer('rbd', '123')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'rbd-mirroring': ['read', 'update', 'delete']}])
+    def test_create_access_permissions(self):
+        self.create_peer('rbd', 'remote', 'id')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'rbd-mirroring': ['read', 'create', 'delete']}])
+    def test_update_access_permissions(self):
+        self.update_peer('rbd', '123')
+        self.assertStatus(403)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'rbd-mirroring': ['read', 'create', 'update']}])
+    def test_delete_access_permissions(self):
+        self.delete_peer('rbd', '123')
+        self.assertStatus(403)
+
+    def test_mirror_mode(self):
+        self.update_pool('rbd', 'disabled')
+        mode = self.get_pool('rbd').get('mirror_mode')
+        self.assertEqual(mode, 'disabled')
+
+        self.update_pool('rbd', 'image')
+        mode = self.get_pool('rbd').get('mirror_mode')
+        self.assertEqual(mode, 'image')
+
+        self.update_pool('rbd', 'pool')
+        mode = self.get_pool('rbd').get('mirror_mode')
+        self.assertEqual(mode, 'pool')
+
+        self.update_pool('rbd', 'disabled')
+        mode = self.get_pool('rbd').get('mirror_mode')
+        self.assertEqual(mode, 'disabled')
+
+    def test_set_invalid_mirror_mode(self):
+        self.update_pool('rbd', 'invalid')
+        self.assertStatus(400)
+
+    def test_set_same_mirror_mode(self):
+        self.update_pool('rbd', 'disabled')
+        self.update_pool('rbd', 'disabled')
+        self.assertStatus(200)
+
+    def test_peer(self):
+        self.update_pool('rbd', 'image')
+        self.assertStatus(200)
+
+        peers = self.list_peers('rbd')
+        self.assertStatus(200)
+        self.assertEqual([], peers)
+
+        uuid = self.create_peer('rbd', 'remote', 'admin')['uuid']
+        self.assertStatus(201)
+
+        peers = self.list_peers('rbd')
+        self.assertStatus(200)
+        self.assertEqual([uuid], peers)
+
+        expected_peer = {
+            'uuid': uuid,
+            'cluster_name': 'remote',
+            'site_name': 'remote',
+            'client_id': 'admin',
+            'mon_host': '',
+            'key': '',
+            'direction': 'rx-tx',
+            'mirror_uuid': ''
+        }
+        peer = self.get_peer('rbd', uuid)
+        self.assertEqual(expected_peer, peer)
+
+        self.update_peer('rbd', uuid, mon_host='1.2.3.4')
+        self.assertStatus(200)
+
+        expected_peer['mon_host'] = '1.2.3.4'
+        peer = self.get_peer('rbd', uuid)
+        self.assertEqual(expected_peer, peer)
+
+        self.delete_peer('rbd', uuid)
+        self.assertStatus(204)
+
+        self.update_pool('rbd', 'disabled')
+        self.assertStatus(200)
+
+    def test_disable_mirror_with_peers(self):
+        self.update_pool('rbd', 'image')
+        self.assertStatus(200)
+
+        uuid = self.create_peer('rbd', 'remote', 'admin')['uuid']
+        self.assertStatus(201)
+
+        self.update_pool('rbd', 'disabled')
+        self.assertStatus(400)
+
+        self.delete_peer('rbd', uuid)
+        self.assertStatus(204)
+
+        self.update_pool('rbd', 'disabled')
+        self.assertStatus(200)
+
+    def test_site_name(self):
+        expected_site_name = {'site_name': 'site-a'}
+        self._task_put('/api/block/mirroring/site_name', expected_site_name)
+        self.assertStatus(200)
+
+        site_name = self._get('/api/block/mirroring/site_name')
+        self.assertStatus(200)
+        self.assertEqual(expected_site_name, site_name)
+
+    def test_bootstrap(self):
+        self.update_pool('rbd', 'image')
+        token_data = self._task_post('/api/block/mirroring/pool/rbd/bootstrap/token', {})
+        self.assertStatus(200)
+
+        import_data = {
+            'token': token_data['token'],
+            'direction': 'invalid'}
+        self._task_post('/api/block/mirroring/pool/rbd/bootstrap/peer', import_data)
+        self.assertStatus(400)
+
+        # cannot import "youself" as peer
+        import_data['direction'] = 'rx'
+        self._task_post('/api/block/mirroring/pool/rbd/bootstrap/peer', import_data)
+        self.assertStatus(400)
diff --git a/qa/tasks/mgr/dashboard/test_requests.py b/qa/tasks/mgr/dashboard/test_requests.py
new file mode 100644
index 000000000..0d7fb75ad
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_requests.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+from . import DEFAULT_API_VERSION
+from .helper import DashboardTestCase
+
+
+class RequestsTest(DashboardTestCase):
+    def test_gzip(self):
+        self._get('/api/summary')
+        self.assertHeaders({
+            'Content-Encoding': 'gzip',
+            'Content-Type': 'application/vnd.ceph.api.v{}+json'.format(DEFAULT_API_VERSION)
+        })
+
+    def test_force_no_gzip(self):
+        self._get('/api/summary', params=dict(
+            headers={'Accept-Encoding': 'identity'}
+        ))
+        self.assertNotIn('Content-Encoding', self._resp.headers)
+        self.assertHeaders({
+            'Content-Type': 'application/json'
+        })
+
+    def test_server(self):
+        self._get('/api/summary')
+        self.assertHeaders({
+            'server': 'Ceph-Dashboard',
+            'Content-Type': 'application/vnd.ceph.api.v{}+json'.format(DEFAULT_API_VERSION),
+            'Content-Security-Policy': "frame-ancestors 'self';",
+            'X-Content-Type-Options': 'nosniff',
+            'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload'
+        })
diff --git a/qa/tasks/mgr/dashboard/test_rgw.py b/qa/tasks/mgr/dashboard/test_rgw.py
new file mode 100644
index 000000000..53577a87a
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_rgw.py
@@ -0,0 +1,867 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+import base64
+import logging
+import time
+from urllib import parse
+
+from cryptography.hazmat.backends import default_backend
+from cryptography.hazmat.primitives.hashes import SHA1
+from cryptography.hazmat.primitives.twofactor.totp import TOTP
+
+from .helper import DashboardTestCase, JLeaf, JList, JObj
+
+logger = logging.getLogger(__name__)
+
+
+class RgwTestCase(DashboardTestCase):
+
+    maxDiff = None
+    create_test_user = False
+
+    AUTH_ROLES = ['rgw-manager']
+
+    @classmethod
+    def setUpClass(cls):
+        super(RgwTestCase, cls).setUpClass()
+        # Create the administrator account.
+        cls._radosgw_admin_cmd([
+            'user', 'create', '--uid', 'admin', '--display-name', 'admin',
+            '--system', '--access-key', 'admin', '--secret', 'admin'
+        ])
+        # Update the dashboard configuration.
+        cls._ceph_cmd_with_secret(['dashboard', 'set-rgw-api-secret-key'], 'admin')
+        cls._ceph_cmd_with_secret(['dashboard', 'set-rgw-api-access-key'], 'admin')
+        # Create a test user?
+        if cls.create_test_user:
+            cls._radosgw_admin_cmd([
+                'user', 'create', '--uid', 'teuth-test-user', '--display-name',
+                'teuth-test-user'
+            ])
+            cls._radosgw_admin_cmd([
+                'caps', 'add', '--uid', 'teuth-test-user', '--caps',
+                'metadata=write'
+            ])
+            cls._radosgw_admin_cmd([
+                'subuser', 'create', '--uid', 'teuth-test-user', '--subuser',
+                'teuth-test-subuser', '--access', 'full', '--key-type', 's3',
+                '--access-key', 'xyz123'
+            ])
+            cls._radosgw_admin_cmd([
+                'subuser', 'create', '--uid', 'teuth-test-user', '--subuser',
+                'teuth-test-subuser2', '--access', 'full', '--key-type',
+                'swift'
+            ])
+
+    @classmethod
+    def tearDownClass(cls):
+        # Delete administrator account.
+        cls._radosgw_admin_cmd(['user', 'rm', '--uid', 'admin'])
+        if cls.create_test_user:
+            cls._radosgw_admin_cmd(['user', 'rm', '--uid=teuth-test-user', '--purge-data'])
+        super(RgwTestCase, cls).tearDownClass()
+
+    def get_rgw_user(self, uid, stats=True):
+        return self._get('/api/rgw/user/{}?stats={}'.format(uid, stats))
+
+
+class RgwApiCredentialsTest(RgwTestCase):
+
+    AUTH_ROLES = ['rgw-manager']
+
+    def test_invalid_credentials(self):
+        self._ceph_cmd_with_secret(['dashboard', 'set-rgw-api-secret-key'], 'invalid')
+        self._ceph_cmd_with_secret(['dashboard', 'set-rgw-api-access-key'], 'invalid')
+        resp = self._get('/api/rgw/user')
+        self.assertStatus(404)
+        self.assertIn('detail', resp)
+        self.assertIn('component', resp)
+        self.assertIn('Error connecting to Object Gateway', resp['detail'])
+        self.assertEqual(resp['component'], 'rgw')
+
+    def test_success(self):
+        # Set the default credentials.
+        self._ceph_cmd_with_secret(['dashboard', 'set-rgw-api-secret-key'], 'admin')
+        self._ceph_cmd_with_secret(['dashboard', 'set-rgw-api-access-key'], 'admin')
+        data = self._get('/ui-api/rgw/status')
+        self.assertStatus(200)
+        self.assertIn('available', data)
+        self.assertIn('message', data)
+        self.assertTrue(data['available'])
+
+
+class RgwSiteTest(RgwTestCase):
+
+    AUTH_ROLES = ['rgw-manager']
+
+    def test_get_placement_targets(self):
+        data = self._get('/api/rgw/site?query=placement-targets')
+        self.assertStatus(200)
+        self.assertSchema(data, JObj({
+            'zonegroup': str,
+            'placement_targets': JList(JObj({
+                'name': str,
+                'data_pool': str
+            }))
+        }))
+
+    def test_get_realms(self):
+        data = self._get('/api/rgw/site?query=realms')
+        self.assertStatus(200)
+        self.assertSchema(data, JList(str))
+
+
+class RgwBucketTest(RgwTestCase):
+
+    _mfa_token_serial = '1'
+    _mfa_token_seed = '23456723'
+    _mfa_token_time_step = 2
+
+    AUTH_ROLES = ['rgw-manager']
+
+    @classmethod
+    def setUpClass(cls):
+        cls.create_test_user = True
+        super(RgwBucketTest, cls).setUpClass()
+        # Create MFA TOTP token for test user.
+        cls._radosgw_admin_cmd([
+            'mfa', 'create', '--uid', 'teuth-test-user', '--totp-serial', cls._mfa_token_serial,
+            '--totp-seed', cls._mfa_token_seed, '--totp-seed-type', 'base32',
+            '--totp-seconds', str(cls._mfa_token_time_step), '--totp-window', '1'
+        ])
+        # Create tenanted users.
+        cls._radosgw_admin_cmd([
+            'user', 'create', '--tenant', 'testx', '--uid', 'teuth-test-user',
+            '--display-name', 'tenanted teuth-test-user'
+        ])
+        cls._radosgw_admin_cmd([
+            'user', 'create', '--tenant', 'testx2', '--uid', 'teuth-test-user2',
+            '--display-name', 'tenanted teuth-test-user 2'
+        ])
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._radosgw_admin_cmd(
+            ['user', 'rm', '--tenant', 'testx', '--uid=teuth-test-user', '--purge-data'])
+        cls._radosgw_admin_cmd(
+            ['user', 'rm', '--tenant', 'testx2', '--uid=teuth-test-user2', '--purge-data'])
+        super(RgwBucketTest, cls).tearDownClass()
+
+    def _get_mfa_token_pin(self):
+        totp_key = base64.b32decode(self._mfa_token_seed)
+        totp = TOTP(totp_key, 6, SHA1(), self._mfa_token_time_step, backend=default_backend(),
+                    enforce_key_length=False)
+        time_value = int(time.time())
+        return totp.generate(time_value)
+
+    def test_all(self):
+        # Create a new bucket.
+        self._post(
+            '/api/rgw/bucket',
+            params={
+                'bucket': 'teuth-test-bucket',
+                'uid': 'admin',
+                'zonegroup': 'default',
+                'placement_target': 'default-placement'
+            })
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self.assertSchema(data, JObj(sub_elems={
+            'bucket_info': JObj(sub_elems={
+                'bucket': JObj(allow_unknown=True, sub_elems={
+                    'name': JLeaf(str),
+                    'bucket_id': JLeaf(str),
+                    'tenant': JLeaf(str)
+                }),
+                'quota': JObj(sub_elems={}, allow_unknown=True),
+                'creation_time': JLeaf(str)
+            }, allow_unknown=True)
+        }, allow_unknown=True))
+        data = data['bucket_info']['bucket']
+        self.assertEqual(data['name'], 'teuth-test-bucket')
+        self.assertEqual(data['tenant'], '')
+
+        # List all buckets.
+        data = self._get('/api/rgw/bucket', version='1.1')
+        self.assertStatus(200)
+        self.assertEqual(len(data), 1)
+        self.assertIn('teuth-test-bucket', data)
+
+        # List all buckets with stats.
+        data = self._get('/api/rgw/bucket?stats=true', version='1.1')
+        self.assertStatus(200)
+        self.assertEqual(len(data), 1)
+        self.assertSchema(data[0], JObj(sub_elems={
+            'bid': JLeaf(str),
+            'bucket': JLeaf(str),
+            'bucket_quota': JObj(sub_elems={}, allow_unknown=True),
+            'id': JLeaf(str),
+            'owner': JLeaf(str),
+            'usage': JObj(sub_elems={}, allow_unknown=True),
+            'tenant': JLeaf(str),
+        }, allow_unknown=True))
+
+        # List all buckets names without stats.
+        data = self._get('/api/rgw/bucket?stats=false', version='1.1')
+        self.assertStatus(200)
+        self.assertEqual(data, ['teuth-test-bucket'])
+
+        # Get the bucket.
+        data = self._get('/api/rgw/bucket/teuth-test-bucket')
+        self.assertStatus(200)
+        self.assertSchema(data, JObj(sub_elems={
+            'id': JLeaf(str),
+            'bid': JLeaf(str),
+            'tenant': JLeaf(str),
+            'bucket': JLeaf(str),
+            'bucket_quota': JObj(sub_elems={}, allow_unknown=True),
+            'owner': JLeaf(str),
+            'mfa_delete': JLeaf(str),
+            'usage': JObj(sub_elems={}, allow_unknown=True),
+            'versioning': JLeaf(str)
+        }, allow_unknown=True))
+        self.assertEqual(data['bucket'], 'teuth-test-bucket')
+        self.assertEqual(data['owner'], 'admin')
+        self.assertEqual(data['placement_rule'], 'default-placement')
+        self.assertEqual(data['versioning'], 'Suspended')
+
+        # Update bucket: change owner, enable versioning.
+        self._put(
+            '/api/rgw/bucket/teuth-test-bucket',
+            params={
+                'bucket_id': data['id'],
+                'uid': 'teuth-test-user',
+                'versioning_state': 'Enabled'
+            })
+        self.assertStatus(200)
+        data = self._get('/api/rgw/bucket/teuth-test-bucket')
+        self.assertStatus(200)
+        self.assertSchema(data, JObj(sub_elems={
+            'owner': JLeaf(str),
+            'bid': JLeaf(str),
+            'tenant': JLeaf(str)
+        }, allow_unknown=True))
+        self.assertEqual(data['owner'], 'teuth-test-user')
+        self.assertEqual(data['versioning'], 'Enabled')
+
+        # Update bucket: enable MFA Delete.
+        self._put(
+            '/api/rgw/bucket/teuth-test-bucket',
+            params={
+                'bucket_id': data['id'],
+                'uid': 'teuth-test-user',
+                'versioning_state': 'Enabled',
+                'mfa_delete': 'Enabled',
+                'mfa_token_serial': self._mfa_token_serial,
+                'mfa_token_pin': self._get_mfa_token_pin()
+            })
+        self.assertStatus(200)
+        data = self._get('/api/rgw/bucket/teuth-test-bucket')
+        self.assertStatus(200)
+        self.assertEqual(data['versioning'], 'Enabled')
+        self.assertEqual(data['mfa_delete'], 'Enabled')
+
+        # Update bucket: disable versioning & MFA Delete.
+        time.sleep(self._mfa_token_time_step * 3)  # Required to get new TOTP pin.
+        self._put(
+            '/api/rgw/bucket/teuth-test-bucket',
+            params={
+                'bucket_id': data['id'],
+                'uid': 'teuth-test-user',
+                'versioning_state': 'Suspended',
+                'mfa_delete': 'Disabled',
+                'mfa_token_serial': self._mfa_token_serial,
+                'mfa_token_pin': self._get_mfa_token_pin()
+            })
+        self.assertStatus(200)
+        data = self._get('/api/rgw/bucket/teuth-test-bucket')
+        self.assertStatus(200)
+        self.assertEqual(data['versioning'], 'Suspended')
+        self.assertEqual(data['mfa_delete'], 'Disabled')
+
+        # Delete the bucket.
+        self._delete('/api/rgw/bucket/teuth-test-bucket')
+        self.assertStatus(204)
+        data = self._get('/api/rgw/bucket', version='1.1')
+        self.assertStatus(200)
+        self.assertEqual(len(data), 0)
+
+    def test_crud_w_tenant(self):
+        # Create a new bucket. The tenant of the user is used when
+        # the bucket is created.
+        self._post(
+            '/api/rgw/bucket',
+            params={
+                'bucket': 'teuth-test-bucket',
+                'uid': 'testx$teuth-test-user',
+                'zonegroup': 'default',
+                'placement_target': 'default-placement'
+            })
+        self.assertStatus(201)
+        # It's not possible to validate the result because there
+        # IS NO result object returned by the RGW Admin OPS API
+        # when a tenanted bucket is created.
+        data = self.jsonBody()
+        self.assertIsNone(data)
+
+        # List all buckets.
+        data = self._get('/api/rgw/bucket', version='1.1')
+        self.assertStatus(200)
+        self.assertEqual(len(data), 1)
+        self.assertIn('testx/teuth-test-bucket', data)
+
+        def _verify_tenant_bucket(bucket, tenant, uid):
+            full_bucket_name = '{}/{}'.format(tenant, bucket)
+            _data = self._get('/api/rgw/bucket/{}'.format(
+                parse.quote_plus(full_bucket_name)))
+            self.assertStatus(200)
+            self.assertSchema(_data, JObj(sub_elems={
+                'owner': JLeaf(str),
+                'bucket': JLeaf(str),
+                'tenant': JLeaf(str),
+                'bid': JLeaf(str)
+            }, allow_unknown=True))
+            self.assertEqual(_data['owner'], '{}${}'.format(tenant, uid))
+            self.assertEqual(_data['bucket'], bucket)
+            self.assertEqual(_data['tenant'], tenant)
+            self.assertEqual(_data['bid'], full_bucket_name)
+            return _data
+
+        # Get the bucket.
+        data = _verify_tenant_bucket('teuth-test-bucket', 'testx', 'teuth-test-user')
+        self.assertEqual(data['placement_rule'], 'default-placement')
+        self.assertEqual(data['versioning'], 'Suspended')
+
+        # Update bucket: different user with different tenant, enable versioning.
+        self._put(
+            '/api/rgw/bucket/{}'.format(
+                parse.quote_plus('testx/teuth-test-bucket')),
+            params={
+                'bucket_id': data['id'],
+                'uid': 'testx2$teuth-test-user2',
+                'versioning_state': 'Enabled'
+            })
+        data = _verify_tenant_bucket('teuth-test-bucket', 'testx2', 'teuth-test-user2')
+        self.assertEqual(data['versioning'], 'Enabled')
+
+        # Change owner to a non-tenanted user
+        self._put(
+            '/api/rgw/bucket/{}'.format(
+                parse.quote_plus('testx2/teuth-test-bucket')),
+            params={
+                'bucket_id': data['id'],
+                'uid': 'admin'
+            })
+        self.assertStatus(200)
+        data = self._get('/api/rgw/bucket/teuth-test-bucket')
+        self.assertStatus(200)
+        self.assertIn('owner', data)
+        self.assertEqual(data['owner'], 'admin')
+        self.assertEqual(data['tenant'], '')
+        self.assertEqual(data['bucket'], 'teuth-test-bucket')
+        self.assertEqual(data['bid'], 'teuth-test-bucket')
+        self.assertEqual(data['versioning'], 'Enabled')
+
+        # Change owner back to tenanted user, suspend versioning.
+        self._put(
+            '/api/rgw/bucket/teuth-test-bucket',
+            params={
+                'bucket_id': data['id'],
+                'uid': 'testx$teuth-test-user',
+                'versioning_state': 'Suspended'
+            })
+        self.assertStatus(200)
+        data = _verify_tenant_bucket('teuth-test-bucket', 'testx', 'teuth-test-user')
+        self.assertEqual(data['versioning'], 'Suspended')
+
+        # Delete the bucket.
+        self._delete('/api/rgw/bucket/{}'.format(
+            parse.quote_plus('testx/teuth-test-bucket')))
+        self.assertStatus(204)
+        data = self._get('/api/rgw/bucket', version='1.1')
+        self.assertStatus(200)
+        self.assertEqual(len(data), 0)
+
+    def test_crud_w_locking(self):
+        # Create
+        self._post('/api/rgw/bucket',
+                   params={
+                       'bucket': 'teuth-test-bucket',
+                       'uid': 'teuth-test-user',
+                       'zonegroup': 'default',
+                       'placement_target': 'default-placement',
+                       'lock_enabled': 'true',
+                       'lock_mode': 'GOVERNANCE',
+                       'lock_retention_period_days': '0',
+                       'lock_retention_period_years': '1'
+                   })
+        self.assertStatus(201)
+        # Read
+        data = self._get('/api/rgw/bucket/teuth-test-bucket')
+        self.assertStatus(200)
+        self.assertSchema(
+            data,
+            JObj(sub_elems={
+                'lock_enabled': JLeaf(bool),
+                'lock_mode': JLeaf(str),
+                'lock_retention_period_days': JLeaf(int),
+                'lock_retention_period_years': JLeaf(int)
+            },
+                allow_unknown=True))
+        self.assertTrue(data['lock_enabled'])
+        self.assertEqual(data['lock_mode'], 'GOVERNANCE')
+        self.assertEqual(data['lock_retention_period_days'], 0)
+        self.assertEqual(data['lock_retention_period_years'], 1)
+        # Update
+        self._put('/api/rgw/bucket/teuth-test-bucket',
+                  params={
+                      'bucket_id': data['id'],
+                      'uid': 'teuth-test-user',
+                      'lock_mode': 'COMPLIANCE',
+                      'lock_retention_period_days': '15',
+                      'lock_retention_period_years': '0'
+                  })
+        self.assertStatus(200)
+        data = self._get('/api/rgw/bucket/teuth-test-bucket')
+        self.assertTrue(data['lock_enabled'])
+        self.assertEqual(data['lock_mode'], 'COMPLIANCE')
+        self.assertEqual(data['lock_retention_period_days'], 15)
+        self.assertEqual(data['lock_retention_period_years'], 0)
+        self.assertStatus(200)
+
+        # Update: Disabling bucket versioning should fail if object locking enabled
+        self._put('/api/rgw/bucket/teuth-test-bucket',
+                  params={
+                      'bucket_id': data['id'],
+                      'uid': 'teuth-test-user',
+                      'versioning_state': 'Suspended'
+                  })
+        self.assertStatus(409)
+
+        # Delete
+        self._delete('/api/rgw/bucket/teuth-test-bucket')
+        self.assertStatus(204)
+
+
+class RgwDaemonTest(RgwTestCase):
+
+    AUTH_ROLES = ['rgw-manager']
+
+    @DashboardTestCase.RunAs('test', 'test', [{
+        'rgw': ['create', 'update', 'delete']
+    }])
+    def test_read_access_permissions(self):
+        self._get('/api/rgw/daemon')
+        self.assertStatus(403)
+        self._get('/api/rgw/daemon/id')
+        self.assertStatus(403)
+
+    def test_list(self):
+        data = self._get('/api/rgw/daemon')
+        self.assertStatus(200)
+        self.assertEqual(len(data), 1)
+        data = data[0]
+        self.assertIn('id', data)
+        self.assertIn('version', data)
+        self.assertIn('server_hostname', data)
+        self.assertIn('zonegroup_name', data)
+        self.assertIn('zone_name', data)
+
+    def test_get(self):
+        data = self._get('/api/rgw/daemon')
+        self.assertStatus(200)
+
+        data = self._get('/api/rgw/daemon/{}'.format(data[0]['id']))
+        self.assertStatus(200)
+        self.assertIn('rgw_metadata', data)
+        self.assertIn('rgw_id', data)
+        self.assertIn('rgw_status', data)
+        self.assertTrue(data['rgw_metadata'])
+
+    def test_status(self):
+        data = self._get('/ui-api/rgw/status')
+        self.assertStatus(200)
+        self.assertIn('available', data)
+        self.assertIn('message', data)
+        self.assertTrue(data['available'])
+
+
+class RgwUserTest(RgwTestCase):
+
+    AUTH_ROLES = ['rgw-manager']
+
+    @classmethod
+    def setUpClass(cls):
+        super(RgwUserTest, cls).setUpClass()
+
+    def _assert_user_data(self, data):
+        self.assertSchema(data, JObj(sub_elems={
+            'caps': JList(JObj(sub_elems={}, allow_unknown=True)),
+            'display_name': JLeaf(str),
+            'email': JLeaf(str),
+            'keys': JList(JObj(sub_elems={}, allow_unknown=True)),
+            'max_buckets': JLeaf(int),
+            'subusers': JList(JLeaf(str)),
+            'suspended': JLeaf(int),
+            'swift_keys': JList(JObj(sub_elems={}, allow_unknown=True)),
+            'tenant': JLeaf(str),
+            'user_id': JLeaf(str),
+            'uid': JLeaf(str)
+        }, allow_unknown=True))
+        self.assertGreaterEqual(len(data['keys']), 1)
+
+    def test_get(self):
+        data = self.get_rgw_user('admin')
+        self.assertStatus(200)
+        self._assert_user_data(data)
+        self.assertEqual(data['user_id'], 'admin')
+        self.assertTrue(data['stats'])
+        self.assertIsInstance(data['stats'], dict)
+        # Test without stats.
+        data = self.get_rgw_user('admin', False)
+        self.assertStatus(200)
+        self._assert_user_data(data)
+        self.assertEqual(data['user_id'], 'admin')
+
+    def test_list(self):
+        data = self._get('/api/rgw/user')
+        self.assertStatus(200)
+        self.assertGreaterEqual(len(data), 1)
+        self.assertIn('admin', data)
+
+    def test_get_emails(self):
+        data = self._get('/api/rgw/user/get_emails')
+        self.assertStatus(200)
+        self.assertSchema(data, JList(str))
+
+    def test_create_get_update_delete(self):
+        # Create a new user.
+        self._post('/api/rgw/user', params={
+            'uid': 'teuth-test-user',
+            'display_name': 'display name'
+        })
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self._assert_user_data(data)
+        self.assertEqual(data['user_id'], 'teuth-test-user')
+        self.assertEqual(data['display_name'], 'display name')
+
+        # Get the user.
+        data = self.get_rgw_user('teuth-test-user')
+        self.assertStatus(200)
+        self._assert_user_data(data)
+        self.assertEqual(data['tenant'], '')
+        self.assertEqual(data['user_id'], 'teuth-test-user')
+        self.assertEqual(data['uid'], 'teuth-test-user')
+
+        # Update the user.
+        self._put(
+            '/api/rgw/user/teuth-test-user',
+            params={'display_name': 'new name'})
+        self.assertStatus(200)
+        data = self.jsonBody()
+        self._assert_user_data(data)
+        self.assertEqual(data['display_name'], 'new name')
+
+        # Delete the user.
+        self._delete('/api/rgw/user/teuth-test-user')
+        self.assertStatus(204)
+        self.get_rgw_user('teuth-test-user')
+        self.assertStatus(500)
+        resp = self.jsonBody()
+        self.assertIn('detail', resp)
+        self.assertIn('failed request with status code 404', resp['detail'])
+        self.assertIn('"Code":"NoSuchUser"', resp['detail'])
+        self.assertIn('"HostId"', resp['detail'])
+        self.assertIn('"RequestId"', resp['detail'])
+
+    def test_create_get_update_delete_w_tenant(self):
+        # Create a new user.
+        self._post(
+            '/api/rgw/user',
+            params={
+                'uid': 'test01$teuth-test-user',
+                'display_name': 'display name'
+            })
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self._assert_user_data(data)
+        self.assertEqual(data['user_id'], 'teuth-test-user')
+        self.assertEqual(data['display_name'], 'display name')
+
+        # Get the user.
+        data = self.get_rgw_user('test01$teuth-test-user')
+        self.assertStatus(200)
+        self._assert_user_data(data)
+        self.assertEqual(data['tenant'], 'test01')
+        self.assertEqual(data['user_id'], 'teuth-test-user')
+        self.assertEqual(data['uid'], 'test01$teuth-test-user')
+
+        # Update the user.
+        self._put(
+            '/api/rgw/user/test01$teuth-test-user',
+            params={'display_name': 'new name'})
+        self.assertStatus(200)
+        data = self.jsonBody()
+        self._assert_user_data(data)
+        self.assertEqual(data['display_name'], 'new name')
+
+        # Delete the user.
+        self._delete('/api/rgw/user/test01$teuth-test-user')
+        self.assertStatus(204)
+        self.get_rgw_user('test01$teuth-test-user')
+        self.assertStatus(500)
+        resp = self.jsonBody()
+        self.assertIn('detail', resp)
+        self.assertIn('failed request with status code 404', resp['detail'])
+        self.assertIn('"Code":"NoSuchUser"', resp['detail'])
+        self.assertIn('"HostId"', resp['detail'])
+        self.assertIn('"RequestId"', resp['detail'])
+
+
+class RgwUserCapabilityTest(RgwTestCase):
+
+    AUTH_ROLES = ['rgw-manager']
+
+    @classmethod
+    def setUpClass(cls):
+        cls.create_test_user = True
+        super(RgwUserCapabilityTest, cls).setUpClass()
+
+    def test_set(self):
+        self._post(
+            '/api/rgw/user/teuth-test-user/capability',
+            params={
+                'type': 'usage',
+                'perm': 'read'
+            })
+        self.assertStatus(201)
+        data = self.jsonBody()
+        self.assertEqual(len(data), 1)
+        data = data[0]
+        self.assertEqual(data['type'], 'usage')
+        self.assertEqual(data['perm'], 'read')
+
+        # Get the user data to validate the capabilities.
+        data = self.get_rgw_user('teuth-test-user')
+        self.assertStatus(200)
+        self.assertGreaterEqual(len(data['caps']), 1)
+        self.assertEqual(data['caps'][0]['type'], 'usage')
+        self.assertEqual(data['caps'][0]['perm'], 'read')
+
+    def test_delete(self):
+        self._delete(
+            '/api/rgw/user/teuth-test-user/capability',
+            params={
+                'type': 'metadata',
+                'perm': 'write'
+            })
+        self.assertStatus(204)
+
+        # Get the user data to validate the capabilities.
+        data = self.get_rgw_user('teuth-test-user')
+        self.assertStatus(200)
+        self.assertEqual(len(data['caps']), 0)
+
+
+class RgwUserKeyTest(RgwTestCase):
+
+    AUTH_ROLES = ['rgw-manager']
+
+    @classmethod
+    def setUpClass(cls):
+        cls.create_test_user = True
+        super(RgwUserKeyTest, cls).setUpClass()
+
+    def test_create_s3(self):
+        self._post(
+            '/api/rgw/user/teuth-test-user/key',
+            params={
+                'key_type': 's3',
+                'generate_key': 'false',
+                'access_key': 'abc987',
+                'secret_key': 'aaabbbccc'
+            })
+        data = self.jsonBody()
+        self.assertStatus(201)
+        self.assertGreaterEqual(len(data), 3)
+        key = self.find_object_in_list('access_key', 'abc987', data)
+        self.assertIsInstance(key, object)
+        self.assertEqual(key['secret_key'], 'aaabbbccc')
+
+    def test_create_swift(self):
+        self._post(
+            '/api/rgw/user/teuth-test-user/key',
+            params={
+                'key_type': 'swift',
+                'subuser': 'teuth-test-subuser',
+                'generate_key': 'false',
+                'secret_key': 'xxxyyyzzz'
+            })
+        data = self.jsonBody()
+        self.assertStatus(201)
+        self.assertGreaterEqual(len(data), 2)
+        key = self.find_object_in_list('secret_key', 'xxxyyyzzz', data)
+        self.assertIsInstance(key, object)
+
+    def test_delete_s3(self):
+        self._delete(
+            '/api/rgw/user/teuth-test-user/key',
+            params={
+                'key_type': 's3',
+                'access_key': 'xyz123'
+            })
+        self.assertStatus(204)
+
+    def test_delete_swift(self):
+        self._delete(
+            '/api/rgw/user/teuth-test-user/key',
+            params={
+                'key_type': 'swift',
+                'subuser': 'teuth-test-user:teuth-test-subuser2'
+            })
+        self.assertStatus(204)
+
+
+class RgwUserQuotaTest(RgwTestCase):
+
+    AUTH_ROLES = ['rgw-manager']
+
+    @classmethod
+    def setUpClass(cls):
+        cls.create_test_user = True
+        super(RgwUserQuotaTest, cls).setUpClass()
+
+    def _assert_quota(self, data):
+        self.assertIn('user_quota', data)
+        self.assertIn('max_objects', data['user_quota'])
+        self.assertIn('enabled', data['user_quota'])
+        self.assertIn('max_size_kb', data['user_quota'])
+        self.assertIn('max_size', data['user_quota'])
+        self.assertIn('bucket_quota', data)
+        self.assertIn('max_objects', data['bucket_quota'])
+        self.assertIn('enabled', data['bucket_quota'])
+        self.assertIn('max_size_kb', data['bucket_quota'])
+        self.assertIn('max_size', data['bucket_quota'])
+
+    def test_get_quota(self):
+        data = self._get('/api/rgw/user/teuth-test-user/quota')
+        self.assertStatus(200)
+        self._assert_quota(data)
+
+    def test_set_user_quota(self):
+        self._put(
+            '/api/rgw/user/teuth-test-user/quota',
+            params={
+                'quota_type': 'user',
+                'enabled': 'true',
+                'max_size_kb': 2048,
+                'max_objects': 101
+            })
+        self.assertStatus(200)
+
+        data = self._get('/api/rgw/user/teuth-test-user/quota')
+        self.assertStatus(200)
+        self._assert_quota(data)
+        self.assertEqual(data['user_quota']['max_objects'], 101)
+        self.assertTrue(data['user_quota']['enabled'])
+        self.assertEqual(data['user_quota']['max_size_kb'], 2048)
+
+    def test_set_bucket_quota(self):
+        self._put(
+            '/api/rgw/user/teuth-test-user/quota',
+            params={
+                'quota_type': 'bucket',
+                'enabled': 'false',
+                'max_size_kb': 4096,
+                'max_objects': 2000
+            })
+        self.assertStatus(200)
+
+        data = self._get('/api/rgw/user/teuth-test-user/quota')
+        self.assertStatus(200)
+        self._assert_quota(data)
+        self.assertEqual(data['bucket_quota']['max_objects'], 2000)
+        self.assertFalse(data['bucket_quota']['enabled'])
+        self.assertEqual(data['bucket_quota']['max_size_kb'], 4096)
+
+
+class RgwUserSubuserTest(RgwTestCase):
+
+    AUTH_ROLES = ['rgw-manager']
+
+    @classmethod
+    def setUpClass(cls):
+        cls.create_test_user = True
+        super(RgwUserSubuserTest, cls).setUpClass()
+
+    def test_create_swift(self):
+        self._post(
+            '/api/rgw/user/teuth-test-user/subuser',
+            params={
+                'subuser': 'tux',
+                'access': 'readwrite',
+                'key_type': 'swift'
+            })
+        self.assertStatus(201)
+        data = self.jsonBody()
+        subuser = self.find_object_in_list('id', 'teuth-test-user:tux', data)
+        self.assertIsInstance(subuser, object)
+        self.assertEqual(subuser['permissions'], 'read-write')
+
+        # Get the user data to validate the keys.
+        data = self.get_rgw_user('teuth-test-user')
+        self.assertStatus(200)
+        key = self.find_object_in_list('user', 'teuth-test-user:tux',
+                                       data['swift_keys'])
+        self.assertIsInstance(key, object)
+
+    def test_create_s3(self):
+        self._post(
+            '/api/rgw/user/teuth-test-user/subuser',
+            params={
+                'subuser': 'hugo',
+                'access': 'write',
+                'generate_secret': 'false',
+                'access_key': 'yyy',
+                'secret_key': 'xxx'
+            })
+        self.assertStatus(201)
+        data = self.jsonBody()
+        subuser = self.find_object_in_list('id', 'teuth-test-user:hugo', data)
+        self.assertIsInstance(subuser, object)
+        self.assertEqual(subuser['permissions'], 'write')
+
+        # Get the user data to validate the keys.
+        data = self.get_rgw_user('teuth-test-user')
+        self.assertStatus(200)
+        key = self.find_object_in_list('user', 'teuth-test-user:hugo',
+                                       data['keys'])
+        self.assertIsInstance(key, object)
+        self.assertEqual(key['secret_key'], 'xxx')
+
+    def test_delete_w_purge(self):
+        self._delete(
+            '/api/rgw/user/teuth-test-user/subuser/teuth-test-subuser2')
+        self.assertStatus(204)
+
+        # Get the user data to check that the keys don't exist anymore.
+        data = self.get_rgw_user('teuth-test-user')
+        self.assertStatus(200)
+        key = self.find_object_in_list(
+            'user', 'teuth-test-user:teuth-test-subuser2', data['swift_keys'])
+        self.assertIsNone(key)
+
+    def test_delete_wo_purge(self):
+        self._delete(
+            '/api/rgw/user/teuth-test-user/subuser/teuth-test-subuser',
+            params={'purge_keys': 'false'})
+        self.assertStatus(204)
+
+        # Get the user data to check whether they keys still exist.
+        data = self.get_rgw_user('teuth-test-user')
+        self.assertStatus(200)
+        key = self.find_object_in_list(
+            'user', 'teuth-test-user:teuth-test-subuser', data['keys'])
+        self.assertIsInstance(key, object)
diff --git a/qa/tasks/mgr/dashboard/test_role.py b/qa/tasks/mgr/dashboard/test_role.py
new file mode 100644
index 000000000..dbfaea9e4
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_role.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase
+
+
+class RoleTest(DashboardTestCase):
+    @classmethod
+    def _create_role(cls, name=None, description=None, scopes_permissions=None):
+        data = {}
+        if name:
+            data['name'] = name
+        if description:
+            data['description'] = description
+        if scopes_permissions:
+            data['scopes_permissions'] = scopes_permissions
+        cls._post('/api/role', data)
+
+    def test_crud_role(self):
+        self._create_role(name='role1',
+                          description='Description 1',
+                          scopes_permissions={'osd': ['read']})
+        self.assertStatus(201)
+        self.assertJsonBody({
+            'name': 'role1',
+            'description': 'Description 1',
+            'scopes_permissions': {'osd': ['read']},
+            'system': False
+        })
+
+        self._get('/api/role/role1')
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'name': 'role1',
+            'description': 'Description 1',
+            'scopes_permissions': {'osd': ['read']},
+            'system': False
+        })
+
+        self._put('/api/role/role1', {
+            'description': 'Description 2',
+            'scopes_permissions': {'osd': ['read', 'update']},
+        })
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'name': 'role1',
+            'description': 'Description 2',
+            'scopes_permissions': {'osd': ['read', 'update']},
+            'system': False
+        })
+
+        self._delete('/api/role/role1')
+        self.assertStatus(204)
+
+    def test_list_roles(self):
+        roles = self._get('/api/role')
+        self.assertStatus(200)
+
+        self.assertGreaterEqual(len(roles), 1)
+        for role in roles:
+            self.assertIn('name', role)
+            self.assertIn('description', role)
+            self.assertIn('scopes_permissions', role)
+            self.assertIn('system', role)
+
+    def test_get_role_does_not_exist(self):
+        self._get('/api/role/role2')
+        self.assertStatus(404)
+
+    def test_create_role_already_exists(self):
+        self._create_role(name='read-only',
+                          description='Description 1',
+                          scopes_permissions={'osd': ['read']})
+        self.assertStatus(400)
+        self.assertError(code='role_already_exists',
+                         component='role')
+
+    def test_create_role_no_name(self):
+        self._create_role(description='Description 1',
+                          scopes_permissions={'osd': ['read']})
+        self.assertStatus(400)
+        self.assertError(code='name_required',
+                         component='role')
+
+    def test_create_role_invalid_scope(self):
+        self._create_role(name='role1',
+                          description='Description 1',
+                          scopes_permissions={'invalid-scope': ['read']})
+        self.assertStatus(400)
+        self.assertError(code='invalid_scope',
+                         component='role')
+
+    def test_create_role_invalid_permission(self):
+        self._create_role(name='role1',
+                          description='Description 1',
+                          scopes_permissions={'osd': ['invalid-permission']})
+        self.assertStatus(400)
+        self.assertError(code='invalid_permission',
+                         component='role')
+
+    def test_delete_role_does_not_exist(self):
+        self._delete('/api/role/role2')
+        self.assertStatus(404)
+
+    def test_delete_system_role(self):
+        self._delete('/api/role/read-only')
+        self.assertStatus(400)
+        self.assertError(code='cannot_delete_system_role',
+                         component='role')
+
+    def test_delete_role_associated_with_user(self):
+        self.create_user("user", "user", ['read-only'])
+        self._create_role(name='role1',
+                          description='Description 1',
+                          scopes_permissions={'user': ['create', 'read', 'update', 'delete']})
+        self.assertStatus(201)
+        self._put('/api/user/user', {'roles': ['role1']})
+        self.assertStatus(200)
+
+        self._delete('/api/role/role1')
+        self.assertStatus(400)
+        self.assertError(code='role_is_associated_with_user',
+                         component='role')
+
+        self._put('/api/user/user', {'roles': ['administrator']})
+        self.assertStatus(200)
+        self._delete('/api/role/role1')
+        self.assertStatus(204)
+        self.delete_user("user")
+
+    def test_update_role_does_not_exist(self):
+        self._put('/api/role/role2', {})
+        self.assertStatus(404)
+
+    def test_update_system_role(self):
+        self._put('/api/role/read-only', {})
+        self.assertStatus(400)
+        self.assertError(code='cannot_update_system_role',
+                         component='role')
+
+    def test_clone_role(self):
+        self._post('/api/role/read-only/clone', {'new_name': 'foo'})
+        self.assertStatus(201)
+        self._delete('/api/role/foo')
diff --git a/qa/tasks/mgr/dashboard/test_settings.py b/qa/tasks/mgr/dashboard/test_settings.py
new file mode 100644
index 000000000..d6ad1e762
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_settings.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase, JAny, JList, JObj
+
+
+class SettingsTest(DashboardTestCase):
+    def setUp(self):
+        super(SettingsTest, self).setUp()
+        self.settings = self._get('/api/settings')
+
+    def tearDown(self):
+        self._put(
+            '/api/settings',
+            {setting['name']: setting['value']
+             for setting in self.settings})
+
+    def test_list_settings(self):
+        settings = self._get('/api/settings')
+        self.assertGreater(len(settings), 10)
+        self.assertSchema(
+            settings,
+            JList(
+                JObj({
+                    'default': JAny(none=False),
+                    'name': str,
+                    'type': str,
+                    'value': JAny(none=False)
+                })))
+        self.assertStatus(200)
+
+    def test_get_setting(self):
+        setting = self._get('/api/settings/rgw-api-access-key')
+        self.assertSchema(
+            setting,
+            JObj({
+                'default': JAny(none=False),
+                'name': str,
+                'type': str,
+                'value': JAny(none=False)
+            }))
+        self.assertStatus(200)
+
+    def test_set_setting(self):
+        self._put('/api/settings/rgw-api-access-key', {'value': 'foo'})
+        self.assertStatus(200)
+
+        value = self._get('/api/settings/rgw-api-access-key')['value']
+        self.assertEqual('foo', value)
+
+    def test_bulk_set(self):
+        self._put('/api/settings', {
+            'RGW_API_ACCESS_KEY': 'dummy-key',
+            'RGW_API_SECRET_KEY': 'dummy-secret',
+        })
+        self.assertStatus(200)
+
+        access_key = self._get('/api/settings/rgw-api-access-key')['value']
+        self.assertStatus(200)
+        self.assertEqual('dummy-key', access_key)
+
+        secret_key = self._get('/api/settings/rgw-api-secret-key')['value']
+        self.assertStatus(200)
+        self.assertEqual('dummy-secret', secret_key)
diff --git a/qa/tasks/mgr/dashboard/test_summary.py b/qa/tasks/mgr/dashboard/test_summary.py
new file mode 100644
index 000000000..a31f89146
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_summary.py
@@ -0,0 +1,39 @@
+from __future__ import absolute_import
+
+from .helper import DashboardTestCase
+
+
+class SummaryTest(DashboardTestCase):
+    CEPHFS = True
+
+    def test_summary(self):
+        data = self._get("/api/summary")
+        self.assertStatus(200)
+
+        self.assertIn('health_status', data)
+        self.assertIn('mgr_id', data)
+        self.assertIn('have_mon_connection', data)
+        self.assertIn('rbd_mirroring', data)
+        self.assertIn('executing_tasks', data)
+        self.assertIn('finished_tasks', data)
+        self.assertIn('version', data)
+        self.assertIsNotNone(data['health_status'])
+        self.assertIsNotNone(data['mgr_id'])
+        self.assertIsNotNone(data['have_mon_connection'])
+        self.assertEqual(data['rbd_mirroring'], {'errors': 0, 'warnings': 0})
+
+    @DashboardTestCase.RunAs('test', 'test', ['pool-manager'])
+    def test_summary_permissions(self):
+        data = self._get("/api/summary")
+        self.assertStatus(200)
+
+        self.assertIn('health_status', data)
+        self.assertIn('mgr_id', data)
+        self.assertIn('have_mon_connection', data)
+        self.assertNotIn('rbd_mirroring', data)
+        self.assertIn('executing_tasks', data)
+        self.assertIn('finished_tasks', data)
+        self.assertIn('version', data)
+        self.assertIsNotNone(data['health_status'])
+        self.assertIsNotNone(data['mgr_id'])
+        self.assertIsNotNone(data['have_mon_connection'])
diff --git a/qa/tasks/mgr/dashboard/test_telemetry.py b/qa/tasks/mgr/dashboard/test_telemetry.py
new file mode 100644
index 000000000..65c62c748
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_telemetry.py
@@ -0,0 +1,98 @@
+from .helper import DashboardTestCase, JObj
+
+
+class TelemetryTest(DashboardTestCase):
+
+    pre_enabled_status = True
+
+    @classmethod
+    def setUpClass(cls):
+        super(TelemetryTest, cls).setUpClass()
+        data = cls._get('/api/mgr/module/telemetry')
+        cls.pre_enabled_status = data['enabled']
+
+        # identify ourselves so we can filter these reports out on the server side
+        cls._put(
+            '/api/settings',
+            {
+                'mgr/telemetry/channel_ident': True,
+                'mgr/telemetry/organization': 'ceph-qa',
+            }
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        if cls.pre_enabled_status:
+            cls._enable_module()
+        else:
+            cls._disable_module()
+        super(TelemetryTest, cls).tearDownClass()
+
+    def test_disable_module(self):
+        self._enable_module()
+        self._check_telemetry_enabled(True)
+        self._disable_module()
+        self._check_telemetry_enabled(False)
+
+    def test_enable_module_correct_license(self):
+        self._disable_module()
+        self._check_telemetry_enabled(False)
+
+        self._put('/api/telemetry', {
+            'enable': True,
+            'license_name': 'sharing-1-0'
+        })
+        self.assertStatus(200)
+        self._check_telemetry_enabled(True)
+
+    def test_enable_module_empty_license(self):
+        self._disable_module()
+        self._check_telemetry_enabled(False)
+
+        self._put('/api/telemetry', {
+            'enable': True,
+            'license_name': ''
+        })
+        self.assertStatus(400)
+        self.assertError(code='telemetry_enable_license_missing')
+        self._check_telemetry_enabled(False)
+
+    def test_enable_module_invalid_license(self):
+        self._disable_module()
+        self._check_telemetry_enabled(False)
+
+        self._put('/api/telemetry', {
+            'enable': True,
+            'license_name': 'invalid-license'
+        })
+        self.assertStatus(400)
+        self.assertError(code='telemetry_enable_license_missing')
+        self._check_telemetry_enabled(False)
+
+    def test_get_report(self):
+        self._enable_module()
+        data = self._get('/api/telemetry/report')
+        self.assertStatus(200)
+        schema = JObj({
+            'report': JObj({}, allow_unknown=True),
+            'device_report': JObj({}, allow_unknown=True)
+        })
+        self.assertSchema(data, schema)
+
+    @classmethod
+    def _enable_module(cls):
+        cls._put('/api/telemetry', {
+            'enable': True,
+            'license_name': 'sharing-1-0'
+        })
+
+    @classmethod
+    def _disable_module(cls):
+        cls._put('/api/telemetry', {
+            'enable': False
+        })
+
+    def _check_telemetry_enabled(self, enabled):
+        data = self._get('/api/mgr/module/telemetry')
+        self.assertStatus(200)
+        self.assertEqual(data['enabled'], enabled)
diff --git a/qa/tasks/mgr/dashboard/test_user.py b/qa/tasks/mgr/dashboard/test_user.py
new file mode 100644
index 000000000..3a6464f5a
--- /dev/null
+++ b/qa/tasks/mgr/dashboard/test_user.py
@@ -0,0 +1,565 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=too-many-public-methods
+
+from __future__ import absolute_import
+
+import time
+from datetime import datetime, timedelta
+
+from .helper import DashboardTestCase
+
+
+class UserTest(DashboardTestCase):
+    @classmethod
+    def setUpClass(cls):
+        super(UserTest, cls).setUpClass()
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-enabled', 'true'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-length-enabled', 'true'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-oldpwd-enabled', 'true'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-username-enabled', 'true'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-exclusion-list-enabled', 'true'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-complexity-enabled', 'true'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-sequential-chars-enabled', 'true'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-repetitive-chars-enabled', 'true'])
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-username-enabled', 'false'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-exclusion-list-enabled', 'false'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-complexity-enabled', 'false'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-sequential-chars-enabled', 'false'])
+        cls._ceph_cmd(['dashboard', 'set-pwd-policy-check-repetitive-chars-enabled', 'false'])
+        super(UserTest, cls).tearDownClass()
+
+    @classmethod
+    def _create_user(cls, username=None, password=None, name=None, email=None, roles=None,
+                     enabled=True, pwd_expiration_date=None, pwd_update_required=False):
+        data = {}
+        if username:
+            data['username'] = username
+        if password:
+            data['password'] = password
+        if name:
+            data['name'] = name
+        if email:
+            data['email'] = email
+        if roles:
+            data['roles'] = roles
+        if pwd_expiration_date:
+            data['pwdExpirationDate'] = pwd_expiration_date
+        data['pwdUpdateRequired'] = pwd_update_required
+        data['enabled'] = enabled
+        cls._post("/api/user", data)
+
+    @classmethod
+    def _reset_login_to_admin(cls, username=None):
+        cls.logout()
+        if username:
+            cls.delete_user(username)
+        cls.login('admin', 'admin')
+
+    def test_crud_user(self):
+        self._create_user(username='user1',
+                          password='mypassword10#',
+                          name='My Name',
+                          email='my@email.com',
+                          roles=['administrator'])
+        self.assertStatus(201)
+        user = self.jsonBody()
+
+        self._get('/api/user/user1')
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'username': 'user1',
+            'name': 'My Name',
+            'email': 'my@email.com',
+            'roles': ['administrator'],
+            'lastUpdate': user['lastUpdate'],
+            'enabled': True,
+            'pwdExpirationDate': None,
+            'pwdUpdateRequired': False
+        })
+
+        self._put('/api/user/user1', {
+            'name': 'My New Name',
+            'email': 'mynew@email.com',
+            'roles': ['block-manager'],
+        })
+        self.assertStatus(200)
+        user = self.jsonBody()
+        self.assertJsonBody({
+            'username': 'user1',
+            'name': 'My New Name',
+            'email': 'mynew@email.com',
+            'roles': ['block-manager'],
+            'lastUpdate': user['lastUpdate'],
+            'enabled': True,
+            'pwdExpirationDate': None,
+            'pwdUpdateRequired': False
+        })
+
+        self._delete('/api/user/user1')
+        self.assertStatus(204)
+
+    def test_crd_disabled_user(self):
+        self._create_user(username='klara',
+                          password='mypassword10#',
+                          name='Klara Musterfrau',
+                          email='klara@musterfrau.com',
+                          roles=['administrator'],
+                          enabled=False)
+        self.assertStatus(201)
+        user = self.jsonBody()
+
+        # Restart dashboard module.
+        self._unload_module('dashboard')
+        self._load_module('dashboard')
+        time.sleep(10)
+
+        self._get('/api/user/klara')
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'username': 'klara',
+            'name': 'Klara Musterfrau',
+            'email': 'klara@musterfrau.com',
+            'roles': ['administrator'],
+            'lastUpdate': user['lastUpdate'],
+            'enabled': False,
+            'pwdExpirationDate': None,
+            'pwdUpdateRequired': False
+        })
+
+        self._delete('/api/user/klara')
+        self.assertStatus(204)
+
+    def test_list_users(self):
+        self._get('/api/user')
+        self.assertStatus(200)
+        user = self.jsonBody()
+        self.assertEqual(len(user), 1)
+        user = user[0]
+        self.assertJsonBody([{
+            'username': 'admin',
+            'name': None,
+            'email': None,
+            'roles': ['administrator'],
+            'lastUpdate': user['lastUpdate'],
+            'enabled': True,
+            'pwdExpirationDate': None,
+            'pwdUpdateRequired': False
+        }])
+
+    def test_create_user_already_exists(self):
+        self._create_user(username='admin',
+                          password='mypassword10#',
+                          name='administrator',
+                          email='my@email.com',
+                          roles=['administrator'])
+        self.assertStatus(400)
+        self.assertError(code='username_already_exists',
+                         component='user')
+
+    def test_create_user_invalid_role(self):
+        self._create_user(username='user1',
+                          password='mypassword10#',
+                          name='My Name',
+                          email='my@email.com',
+                          roles=['invalid-role'])
+        self.assertStatus(400)
+        self.assertError(code='role_does_not_exist',
+                         component='user')
+
+    def test_create_user_invalid_chars_in_name(self):
+        self._create_user(username='userö',
+                          password='mypassword10#',
+                          name='administrator',
+                          email='my@email.com',
+                          roles=['administrator'])
+        self.assertStatus(400)
+        self.assertError(code='ceph_type_not_valid',
+                         component='user')
+
+    def test_delete_user_does_not_exist(self):
+        self._delete('/api/user/user2')
+        self.assertStatus(404)
+
+    @DashboardTestCase.RunAs('test', 'test', [{'user': ['create', 'read', 'update', 'delete']}])
+    def test_delete_current_user(self):
+        self._delete('/api/user/test')
+        self.assertStatus(400)
+        self.assertError(code='cannot_delete_current_user',
+                         component='user')
+
+    @DashboardTestCase.RunAs('test', 'test', [{'user': ['create', 'read', 'update', 'delete']}])
+    def test_disable_current_user(self):
+        self._put('/api/user/test', {'enabled': False})
+        self.assertStatus(400)
+        self.assertError(code='cannot_disable_current_user',
+                         component='user')
+
+    def test_update_user_does_not_exist(self):
+        self._put('/api/user/user2', {'name': 'My New Name'})
+        self.assertStatus(404)
+
+    def test_update_user_invalid_role(self):
+        self._put('/api/user/admin', {'roles': ['invalid-role']})
+        self.assertStatus(400)
+        self.assertError(code='role_does_not_exist',
+                         component='user')
+
+    def test_change_password_from_other_user(self):
+        self._post('/api/user/test2/change_password', {
+            'old_password': 'abc',
+            'new_password': 'xyz'
+        })
+        self.assertStatus(400)
+        self.assertError(code='invalid_user_context', component='user')
+
+    def test_change_password_old_not_match(self):
+        self._post('/api/user/admin/change_password', {
+            'old_password': 'foo',
+            'new_password': 'bar'
+        })
+        self.assertStatus(400)
+        self.assertError(code='invalid_old_password', component='user')
+
+    def test_change_password_as_old_password(self):
+        self.create_user('test1', 'mypassword10#', ['read-only'], force_password=False)
+        self.login('test1', 'mypassword10#')
+        self._post('/api/user/test1/change_password', {
+            'old_password': 'mypassword10#',
+            'new_password': 'mypassword10#'
+        })
+        self.assertStatus(400)
+        self.assertError('password_policy_validation_failed', 'user',
+                         'Password must not be the same as the previous one.')
+        self._reset_login_to_admin('test1')
+
+    def test_change_password_contains_username(self):
+        self.create_user('test1', 'mypassword10#', ['read-only'], force_password=False)
+        self.login('test1', 'mypassword10#')
+        self._post('/api/user/test1/change_password', {
+            'old_password': 'mypassword10#',
+            'new_password': 'mypasstest1@#'
+        })
+        self.assertStatus(400)
+        self.assertError('password_policy_validation_failed', 'user',
+                         'Password must not contain username.')
+        self._reset_login_to_admin('test1')
+
+    def test_change_password_contains_forbidden_words(self):
+        self.create_user('test1', 'mypassword10#', ['read-only'], force_password=False)
+        self.login('test1', 'mypassword10#')
+        self._post('/api/user/test1/change_password', {
+            'old_password': 'mypassword10#',
+            'new_password': 'mypassOSD01'
+        })
+        self.assertStatus(400)
+        self.assertError('password_policy_validation_failed', 'user',
+                         'Password must not contain the keyword "OSD".')
+        self._reset_login_to_admin('test1')
+
+    def test_change_password_contains_sequential_characters(self):
+        self.create_user('test1', 'mypassword10#', ['read-only'], force_password=False)
+        self.login('test1', 'mypassword10#')
+        self._post('/api/user/test1/change_password', {
+            'old_password': 'mypassword10#',
+            'new_password': 'mypass123456!@$'
+        })
+        self.assertStatus(400)
+        self.assertError('password_policy_validation_failed', 'user',
+                         'Password must not contain sequential characters.')
+        self._reset_login_to_admin('test1')
+
+    def test_change_password_contains_repetetive_characters(self):
+        self.create_user('test1', 'mypassword10#', ['read-only'], force_password=False)
+        self.login('test1', 'mypassword10#')
+        self._post('/api/user/test1/change_password', {
+            'old_password': 'mypassword10#',
+            'new_password': 'aaaaA1@!#'
+        })
+        self.assertStatus(400)
+        self.assertError('password_policy_validation_failed', 'user',
+                         'Password must not contain repetitive characters.')
+        self._reset_login_to_admin('test1')
+
+    @DashboardTestCase.RunAs('test1', 'mypassword10#', ['read-only'], False)
+    def test_change_password(self):
+        self._post('/api/user/test1/change_password', {
+            'old_password': 'mypassword10#',
+            'new_password': 'newpassword01#'
+        })
+        self.assertStatus(200)
+        self.logout()
+        self._post('/api/auth', {'username': 'test1', 'password': 'mypassword10#'})
+        self.assertStatus(400)
+        self.assertError(code='invalid_credentials', component='auth')
+
+    def test_create_user_password_cli(self):
+        exitcode = self._ceph_cmd_with_secret(['dashboard', 'ac-user-create',
+                                               'test1'],
+                                              'mypassword10#',
+                                              return_exit_code=True)
+        self.assertEqual(exitcode, 0)
+        self.delete_user('test1')
+
+    @DashboardTestCase.RunAs('test2', 'foo_bar_10#', force_password=False, login=False)
+    def test_change_user_password_cli(self):
+        exitcode = self._ceph_cmd_with_secret(['dashboard', 'ac-user-set-password',
+                                               'test2'],
+                                              'foo_new-password01#',
+                                              return_exit_code=True)
+        self.assertEqual(exitcode, 0)
+
+    def test_create_user_password_force_cli(self):
+        exitcode = self._ceph_cmd_with_secret(['dashboard', 'ac-user-create',
+                                               '--force-password', 'test11'],
+                                              'bar',
+                                              return_exit_code=True)
+        self.assertEqual(exitcode, 0)
+        self.delete_user('test11')
+
+    @DashboardTestCase.RunAs('test22', 'foo_bar_10#', force_password=False, login=False)
+    def test_change_user_password_force_cli(self):
+        exitcode = self._ceph_cmd_with_secret(['dashboard', 'ac-user-set-password',
+                                               '--force-password', 'test22'],
+                                              'bar',
+                                              return_exit_code=True)
+        self.assertEqual(exitcode, 0)
+
+    def test_create_user_password_cli_fail(self):
+        exitcode = self._ceph_cmd_with_secret(['dashboard', 'ac-user-create',
+                                               'test3'],
+                                              'foo',
+                                              return_exit_code=True)
+        self.assertNotEqual(exitcode, 0)
+
+    @DashboardTestCase.RunAs('test4', 'x1z_tst+_10#', force_password=False, login=False)
+    def test_change_user_password_cli_fail(self):
+        exitcode = self._ceph_cmd_with_secret(['dashboard', 'ac-user-set-password',
+                                               'test4'],
+                                              'bar',
+                                              return_exit_code=True)
+        self.assertNotEqual(exitcode, 0)
+
+    def test_create_user_with_pwd_expiration_date(self):
+        future_date = datetime.utcnow() + timedelta(days=10)
+        future_date = int(time.mktime(future_date.timetuple()))
+
+        self._create_user(username='user1',
+                          password='mypassword10#',
+                          name='My Name',
+                          email='my@email.com',
+                          roles=['administrator'],
+                          pwd_expiration_date=future_date)
+        self.assertStatus(201)
+        user = self.jsonBody()
+
+        self._get('/api/user/user1')
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'username': 'user1',
+            'name': 'My Name',
+            'email': 'my@email.com',
+            'roles': ['administrator'],
+            'lastUpdate': user['lastUpdate'],
+            'enabled': True,
+            'pwdExpirationDate': future_date,
+            'pwdUpdateRequired': False
+        })
+        self._delete('/api/user/user1')
+
+    def test_create_with_pwd_expiration_date_not_valid(self):
+        past_date = datetime.utcnow() - timedelta(days=10)
+        past_date = int(time.mktime(past_date.timetuple()))
+
+        self._create_user(username='user1',
+                          password='mypassword10#',
+                          name='My Name',
+                          email='my@email.com',
+                          roles=['administrator'],
+                          pwd_expiration_date=past_date)
+        self.assertStatus(400)
+        self.assertError(code='pwd_past_expiration_date', component='user')
+
+    def test_create_with_default_expiration_date(self):
+        future_date_1 = datetime.utcnow() + timedelta(days=9)
+        future_date_1 = int(time.mktime(future_date_1.timetuple()))
+        future_date_2 = datetime.utcnow() + timedelta(days=11)
+        future_date_2 = int(time.mktime(future_date_2.timetuple()))
+
+        self._ceph_cmd(['dashboard', 'set-user-pwd-expiration-span', '10'])
+        self._create_user(username='user1',
+                          password='mypassword10#',
+                          name='My Name',
+                          email='my@email.com',
+                          roles=['administrator'])
+        self.assertStatus(201)
+
+        user = self._get('/api/user/user1')
+        self.assertStatus(200)
+        self.assertIsNotNone(user['pwdExpirationDate'])
+        self.assertGreater(user['pwdExpirationDate'], future_date_1)
+        self.assertLess(user['pwdExpirationDate'], future_date_2)
+
+        self._delete('/api/user/user1')
+        self._ceph_cmd(['dashboard', 'set-user-pwd-expiration-span', '0'])
+
+    def test_pwd_expiration_date_update(self):
+        self._ceph_cmd(['dashboard', 'set-user-pwd-expiration-span', '10'])
+        self.create_user('user1', 'mypassword10#', ['administrator'])
+
+        user_1 = self._get('/api/user/user1')
+        self.assertStatus(200)
+
+        # Let's wait 1 s to ensure pwd expiration date is not the same
+        time.sleep(1)
+
+        self.login('user1', 'mypassword10#')
+        self._post('/api/user/user1/change_password', {
+            'old_password': 'mypassword10#',
+            'new_password': 'newpassword01#'
+        })
+        self.assertStatus(200)
+
+        # Compare password expiration dates.
+        self._reset_login_to_admin()
+        user_1_pwd_changed = self._get('/api/user/user1')
+        self.assertStatus(200)
+        self.assertLess(user_1['pwdExpirationDate'], user_1_pwd_changed['pwdExpirationDate'])
+
+        # Cleanup
+        self.delete_user('user1')
+        self._ceph_cmd(['dashboard', 'set-user-pwd-expiration-span', '0'])
+
+    def test_pwd_update_required(self):
+        self._create_user(username='user1',
+                          password='mypassword10#',
+                          name='My Name',
+                          email='my@email.com',
+                          roles=['administrator'],
+                          pwd_update_required=True)
+        self.assertStatus(201)
+
+        user_1 = self._get('/api/user/user1')
+        self.assertStatus(200)
+        self.assertEqual(user_1['pwdUpdateRequired'], True)
+
+        self.login('user1', 'mypassword10#')
+        self.assertStatus(201)
+
+        self._get('/api/osd')
+        self.assertStatus(403)
+        self._reset_login_to_admin('user1')
+
+    def test_pwd_update_required_change_pwd(self):
+        self._create_user(username='user1',
+                          password='mypassword10#',
+                          name='My Name',
+                          email='my@email.com',
+                          roles=['administrator'],
+                          pwd_update_required=True)
+        self.assertStatus(201)
+
+        self.login('user1', 'mypassword10#')
+        self._post('/api/user/user1/change_password', {
+            'old_password': 'mypassword10#',
+            'new_password': 'newpassword01#'
+        })
+
+        self.login('user1', 'newpassword01#')
+        user_1 = self._get('/api/user/user1')
+        self.assertStatus(200)
+        self.assertEqual(user_1['pwdUpdateRequired'], False)
+        self._get('/api/osd')
+        self.assertStatus(200)
+        self._reset_login_to_admin('user1')
+
+    def test_validate_password_weak(self):
+        self._post('/api/user/validate_password', {
+            'password': 'mypassword1'
+        })
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'valid': True,
+            'credits': 11,
+            'valuation': 'Weak'
+        })
+
+    def test_validate_password_ok(self):
+        self._post('/api/user/validate_password', {
+            'password': 'mypassword1!@'
+        })
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'valid': True,
+            'credits': 17,
+            'valuation': 'OK'
+        })
+
+    def test_validate_password_strong(self):
+        self._post('/api/user/validate_password', {
+            'password': 'testpassword0047!@'
+        })
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'valid': True,
+            'credits': 22,
+            'valuation': 'Strong'
+        })
+
+    def test_validate_password_very_strong(self):
+        self._post('/api/user/validate_password', {
+            'password': 'testpassword#!$!@$'
+        })
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'valid': True,
+            'credits': 30,
+            'valuation': 'Very strong'
+        })
+
+    def test_validate_password_fail(self):
+        self._post('/api/user/validate_password', {
+            'password': 'foo'
+        })
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'valid': False,
+            'credits': 0,
+            'valuation': 'Password is too weak.'
+        })
+
+    def test_validate_password_fail_name(self):
+        self._post('/api/user/validate_password', {
+            'password': 'x1zhugo_10',
+            'username': 'hugo'
+        })
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'valid': False,
+            'credits': 0,
+            'valuation': 'Password must not contain username.'
+        })
+
+    def test_validate_password_fail_oldpwd(self):
+        self._post('/api/user/validate_password', {
+            'password': 'x1zt-st10',
+            'old_password': 'x1zt-st10'
+        })
+        self.assertStatus(200)
+        self.assertJsonBody({
+            'valid': False,
+            'credits': 0,
+            'valuation': 'Password must not be the same as the previous one.'
+        })
+
+    def test_create_user_pwd_update_required(self):
+        self.create_user('foo', 'bar', cmd_args=['--pwd_update_required'])
+        self._get('/api/user/foo')
+        self.assertStatus(200)
+        self.assertJsonSubset({
+            'username': 'foo',
+            'pwdUpdateRequired': True
+        })
+        self.delete_user('foo')
diff --git a/qa/tasks/mgr/mgr_test_case.py b/qa/tasks/mgr/mgr_test_case.py
new file mode 100644
index 000000000..f5392d3ba
--- /dev/null
+++ b/qa/tasks/mgr/mgr_test_case.py
@@ -0,0 +1,220 @@
+import json
+import logging
+
+from unittest import SkipTest
+
+from teuthology import misc
+from tasks.ceph_test_case import CephTestCase
+
+# TODO move definition of CephCluster away from the CephFS stuff
+from tasks.cephfs.filesystem import CephCluster
+
+
+log = logging.getLogger(__name__)
+
+
+class MgrCluster(CephCluster):
+    def __init__(self, ctx):
+        super(MgrCluster, self).__init__(ctx)
+        self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr'))
+
+        if len(self.mgr_ids) == 0:
+            raise RuntimeError(
+                "This task requires at least one manager daemon")
+
+        self.mgr_daemons = dict(
+            [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id
+             in self.mgr_ids])
+
+    def mgr_stop(self, mgr_id):
+        self.mgr_daemons[mgr_id].stop()
+
+    def mgr_fail(self, mgr_id):
+        self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id)
+
+    def mgr_restart(self, mgr_id):
+        self.mgr_daemons[mgr_id].restart()
+
+    def get_mgr_map(self):
+        return json.loads(
+            self.mon_manager.raw_cluster_cmd("mgr", "dump", "--format=json-pretty"))
+
+    def get_active_id(self):
+        return self.get_mgr_map()["active_name"]
+
+    def get_standby_ids(self):
+        return [s['name'] for s in self.get_mgr_map()["standbys"]]
+
+    def set_module_conf(self, module, key, val):
+        self.mon_manager.raw_cluster_cmd("config", "set", "mgr",
+                                         "mgr/{0}/{1}".format(
+                                             module, key
+                                         ), val)
+
+    def set_module_localized_conf(self, module, mgr_id, key, val, force):
+        cmd = ["config", "set", "mgr",
+               "/".join(["mgr", module, mgr_id, key]),
+               val]
+        if force:
+            cmd.append("--force")
+        self.mon_manager.raw_cluster_cmd(*cmd)
+
+
+class MgrTestCase(CephTestCase):
+    MGRS_REQUIRED = 1
+
+    @classmethod
+    def setup_mgrs(cls):
+        # Stop all the daemons
+        for daemon in cls.mgr_cluster.mgr_daemons.values():
+            daemon.stop()
+
+        for mgr_id in cls.mgr_cluster.mgr_ids:
+            cls.mgr_cluster.mgr_fail(mgr_id)
+
+        # Unload all non-default plugins
+        loaded = json.loads(cls.mgr_cluster.mon_manager.raw_cluster_cmd(
+                   "mgr", "module", "ls"))['enabled_modules']
+        unload_modules = set(loaded) - {"cephadm", "restful"}
+
+        for m in unload_modules:
+            cls.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "mgr", "module", "disable", m)
+
+        # Start all the daemons
+        for daemon in cls.mgr_cluster.mgr_daemons.values():
+            daemon.restart()
+
+        # Wait for an active to come up
+        cls.wait_until_true(lambda: cls.mgr_cluster.get_active_id() != "",
+                             timeout=20)
+
+        expect_standbys = set(cls.mgr_cluster.mgr_ids) \
+                          - {cls.mgr_cluster.get_active_id()}
+        cls.wait_until_true(
+            lambda: set(cls.mgr_cluster.get_standby_ids()) == expect_standbys,
+            timeout=20)
+
+    @classmethod
+    def setUpClass(cls):
+        # The test runner should have populated this
+        assert cls.mgr_cluster is not None
+
+        if len(cls.mgr_cluster.mgr_ids) < cls.MGRS_REQUIRED:
+            raise SkipTest(
+                "Only have {0} manager daemons, {1} are required".format(
+                    len(cls.mgr_cluster.mgr_ids), cls.MGRS_REQUIRED))
+
+        cls.setup_mgrs()
+
+    @classmethod
+    def _unload_module(cls, module_name):
+        def is_disabled():
+            enabled_modules = json.loads(cls.mgr_cluster.mon_manager.raw_cluster_cmd(
+                'mgr', 'module', 'ls'))['enabled_modules']
+            return module_name not in enabled_modules
+
+        if is_disabled():
+            return
+
+        log.debug("Unloading Mgr module %s ...", module_name)
+        cls.mgr_cluster.mon_manager.raw_cluster_cmd('mgr', 'module', 'disable', module_name)
+        cls.wait_until_true(is_disabled, timeout=30)
+
+    @classmethod
+    def _load_module(cls, module_name):
+        loaded = json.loads(cls.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "module", "ls"))['enabled_modules']
+        if module_name in loaded:
+            # The enable command is idempotent, but our wait for a restart
+            # isn't, so let's return now if it's already loaded
+            return
+
+        initial_mgr_map = cls.mgr_cluster.get_mgr_map()
+
+        # check if the the module is configured as an always on module
+        mgr_daemons = json.loads(cls.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "metadata"))
+
+        for daemon in mgr_daemons:
+            if daemon["name"] == initial_mgr_map["active_name"]:
+                ceph_version = daemon["ceph_release"]
+                always_on = initial_mgr_map["always_on_modules"].get(ceph_version, [])
+                if module_name in always_on:
+                    return
+
+        log.debug("Loading Mgr module %s ...", module_name)
+        initial_gid = initial_mgr_map['active_gid']
+        cls.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "module", "enable", module_name, "--force")
+
+        # Wait for the module to load
+        def has_restarted():
+            mgr_map = cls.mgr_cluster.get_mgr_map()
+            done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+            if done:
+                log.debug("Restarted after module load (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+            return done
+        cls.wait_until_true(has_restarted, timeout=30)
+
+
+    @classmethod
+    def _get_uri(cls, service_name):
+        # Little dict hack so that I can assign into this from
+        # the get_or_none function
+        mgr_map = {'x': None}
+
+        def _get_or_none():
+            mgr_map['x'] = cls.mgr_cluster.get_mgr_map()
+            result = mgr_map['x']['services'].get(service_name, None)
+            return result
+
+        cls.wait_until_true(lambda: _get_or_none() is not None, 30)
+
+        uri = mgr_map['x']['services'][service_name]
+
+        log.debug("Found {0} at {1} (daemon {2}/{3})".format(
+            service_name, uri, mgr_map['x']['active_name'],
+            mgr_map['x']['active_gid']))
+
+        return uri
+
+    @classmethod
+    def _assign_ports(cls, module_name, config_name, min_port=7789):
+        """
+        To avoid the need to run lots of hosts in teuthology tests to
+        get different URLs per mgr, we will hand out different ports
+        to each mgr here.
+
+        This is already taken care of for us when running in a vstart
+        environment.
+        """
+        # Start handing out ports well above Ceph's range.
+        assign_port = min_port
+
+        for mgr_id in cls.mgr_cluster.mgr_ids:
+            cls.mgr_cluster.mgr_stop(mgr_id)
+            cls.mgr_cluster.mgr_fail(mgr_id)
+
+        for mgr_id in cls.mgr_cluster.mgr_ids:
+            log.debug("Using port {0} for {1} on mgr.{2}".format(
+                assign_port, module_name, mgr_id
+            ))
+            cls.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+                                                      config_name,
+                                                      str(assign_port),
+                                                      force=True)
+            assign_port += 1
+
+        for mgr_id in cls.mgr_cluster.mgr_ids:
+            cls.mgr_cluster.mgr_restart(mgr_id)
+
+        def is_available():
+            mgr_map = cls.mgr_cluster.get_mgr_map()
+            done = mgr_map['available']
+            if done:
+                log.debug("Available after assign ports (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+            return done
+        cls.wait_until_true(is_available, timeout=30)
diff --git a/qa/tasks/mgr/test_cache.py b/qa/tasks/mgr/test_cache.py
new file mode 100644
index 000000000..71131cbc6
--- /dev/null
+++ b/qa/tasks/mgr/test_cache.py
@@ -0,0 +1,83 @@
+import json
+
+from .mgr_test_case import MgrTestCase
+
+class TestCache(MgrTestCase):
+
+    def setUp(self):
+        super(TestCache, self).setUp()
+        self.setup_mgrs()
+        self._load_module("cli_api")
+        self.ttl = 10
+        self.enable_cache(self.ttl)
+
+    def tearDown(self):
+        self.disable_cache()
+
+    def get_hit_miss_ratio(self):
+        perf_dump_command = f"daemon mgr.{self.mgr_cluster.get_active_id()} perf dump"
+        perf_dump_res = self.cluster_cmd(perf_dump_command)
+        perf_dump = json.loads(perf_dump_res)
+        h = perf_dump["mgr"]["cache_hit"]
+        m = perf_dump["mgr"]["cache_miss"]
+        return int(h), int(m)
+
+    def enable_cache(self, ttl):
+        set_ttl = f"config set mgr mgr_ttl_cache_expire_seconds {ttl}"
+        self.cluster_cmd(set_ttl)
+
+    def disable_cache(self):
+        set_ttl = "config set mgr mgr_ttl_cache_expire_seconds 0"
+        self.cluster_cmd(set_ttl)
+
+
+    def test_init_cache(self):
+        get_ttl = "config get mgr mgr_ttl_cache_expire_seconds"
+        res = self.cluster_cmd(get_ttl)
+        self.assertEquals(int(res), 10)
+
+    def test_health_not_cached(self):
+        get_health = "mgr api get health"
+
+        h_start, m_start = self.get_hit_miss_ratio()
+        self.cluster_cmd(get_health)
+        h, m = self.get_hit_miss_ratio()
+
+        self.assertEquals(h, h_start)
+        self.assertEquals(m, m_start)
+
+    def test_osdmap(self):
+        get_osdmap = "mgr api get osd_map"
+
+        # store in cache
+        self.cluster_cmd(get_osdmap)
+        # get from cache
+        res = self.cluster_cmd(get_osdmap)
+        osd_map = json.loads(res)
+        self.assertIn("osds", osd_map)
+        self.assertGreater(len(osd_map["osds"]), 0)
+        self.assertIn("epoch", osd_map)
+
+
+
+    def test_hit_miss_ratio(self):
+        get_osdmap = "mgr api get osd_map"
+
+        hit_start, miss_start = self.get_hit_miss_ratio()
+
+        def wait_miss():
+            self.cluster_cmd(get_osdmap)
+            _, m = self.get_hit_miss_ratio()
+            return m == miss_start + 1
+
+        # Miss, add osd_map to cache
+        self.wait_until_true(wait_miss, self.ttl + 5)
+        h, m = self.get_hit_miss_ratio()
+        self.assertEquals(h, hit_start)
+        self.assertEquals(m, miss_start+1)
+
+        # Hit, get osd_map from cache
+        self.cluster_cmd(get_osdmap)
+        h, m = self.get_hit_miss_ratio()
+        self.assertEquals(h, hit_start+1)
+        self.assertEquals(m, miss_start+1)
diff --git a/qa/tasks/mgr/test_crash.py b/qa/tasks/mgr/test_crash.py
new file mode 100644
index 000000000..49191127f
--- /dev/null
+++ b/qa/tasks/mgr/test_crash.py
@@ -0,0 +1,108 @@
+import json
+import logging
+import datetime
+
+from .mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+UUID = 'd5775432-0742-44a3-a435-45095e32e6b1'
+DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
+
+
+class TestCrash(MgrTestCase):
+
+    def setUp(self):
+        super(TestCrash, self).setUp()
+        self.setup_mgrs()
+        self._load_module('crash')
+
+        # Whip up some crash data
+        self.crashes = dict()
+        now = datetime.datetime.utcnow()
+
+        for i in (0, 1, 3, 4, 8):
+            timestamp = now - datetime.timedelta(days=i)
+            timestamp = timestamp.strftime(DATEFMT) + 'Z'
+            crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')
+            self.crashes[crash_id] = {
+                'crash_id': crash_id, 'timestamp': timestamp,
+            }
+
+            self.assertEqual(
+                0,
+                self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                    'crash', 'post', '-i', '-',
+                    stdin=json.dumps(self.crashes[crash_id]),
+                )
+            )
+
+        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'crash', 'ls',
+        )
+        log.warning("setUp: crash ls returns %s" % retstr)
+
+        self.oldest_crashid = crash_id
+
+    def tearDown(self):
+        for crash in self.crashes.values():
+            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                'crash', 'rm', crash['crash_id']
+            )
+
+    def test_info(self):
+        for crash in self.crashes.values():
+            log.warning('test_info: crash %s' % crash)
+            retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                'crash', 'ls'
+            )
+            log.warning('ls output: %s' % retstr)
+            retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                'crash', 'info', crash['crash_id'],
+            )
+            log.warning('crash info output: %s' % retstr)
+            crashinfo = json.loads(retstr)
+            self.assertIn('crash_id', crashinfo)
+            self.assertIn('timestamp', crashinfo)
+
+    def test_ls(self):
+        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'crash', 'ls',
+        )
+        for crash in self.crashes.values():
+            self.assertIn(crash['crash_id'], retstr)
+
+    def test_rm(self):
+        crashid = next(iter(self.crashes.keys()))
+        self.assertEqual(
+            0,
+            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                'crash', 'rm', crashid,
+            )
+        )
+
+        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'crash', 'ls',
+        )
+        self.assertNotIn(crashid, retstr)
+
+    def test_stat(self):
+        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'crash', 'stat',
+        )
+        self.assertIn('5 crashes recorded', retstr)
+        self.assertIn('4 older than 1 days old:', retstr)
+        self.assertIn('3 older than 3 days old:', retstr)
+        self.assertIn('1 older than 7 days old:', retstr)
+
+    def test_prune(self):
+        self.assertEqual(
+            0,
+            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                'crash', 'prune', '5'
+            )
+        )
+        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'crash', 'ls',
+        )
+        self.assertNotIn(self.oldest_crashid, retstr)
diff --git a/qa/tasks/mgr/test_dashboard.py b/qa/tasks/mgr/test_dashboard.py
new file mode 100644
index 000000000..c3459ec02
--- /dev/null
+++ b/qa/tasks/mgr/test_dashboard.py
@@ -0,0 +1,177 @@
+import logging
+import ssl
+
+import requests
+from requests.adapters import HTTPAdapter
+
+from .mgr_test_case import MgrTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
+    MGRS_REQUIRED = 3
+
+    def setUp(self):
+        super(TestDashboard, self).setUp()
+
+        self._assign_ports("dashboard", "ssl_server_port")
+        self._load_module("dashboard")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("dashboard",
+                                                     "create-self-signed-cert")
+
+    def tearDown(self):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("config", "set", "mgr",
+                                                     "mgr/dashboard/standby_behaviour",
+                                                     "redirect")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("config", "set", "mgr",
+                                                     "mgr/dashboard/standby_error_status_code",
+                                                     "500")
+
+    def wait_until_webserver_available(self, url):
+        def _check_connection():
+            try:
+                requests.get(url, allow_redirects=False, verify=False)
+                return True
+            except requests.ConnectionError:
+                pass
+            return False
+        self.wait_until_true(_check_connection, timeout=30)
+
+    def test_standby(self):
+        # skip this test if mgr_standby_modules=false
+        if self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "config", "get", "mgr", "mgr_standby_modules").strip() == "false":
+            log.info("Skipping test_standby since mgr_standby_modules=false")
+            return
+
+        original_active_id = self.mgr_cluster.get_active_id()
+        original_uri = self._get_uri("dashboard")
+        log.info("Originally running manager '{}' at {}".format(
+            original_active_id, original_uri))
+
+        # Force a failover and wait until the previously active manager
+        # is listed as standby.
+        self.mgr_cluster.mgr_fail(original_active_id)
+        self.wait_until_true(
+            lambda: original_active_id in self.mgr_cluster.get_standby_ids(),
+            timeout=30)
+
+        failed_active_id = self.mgr_cluster.get_active_id()
+        failed_over_uri = self._get_uri("dashboard")
+        log.info("After failover running manager '{}' at {}".format(
+            failed_active_id, failed_over_uri))
+
+        self.assertNotEqual(original_uri, failed_over_uri)
+
+        # Wait until web server of the standby node is settled.
+        self.wait_until_webserver_available(original_uri)
+
+        # The original active daemon should have come back up as a standby
+        # and be doing redirects to the new active daemon.
+        r = requests.get(original_uri, allow_redirects=False, verify=False)
+        self.assertEqual(r.status_code, 303)
+        self.assertEqual(r.headers['Location'], failed_over_uri)
+
+        # Ensure that every URL redirects to the active daemon.
+        r = requests.get("{}/runtime.js".format(original_uri.strip('/')),
+                         allow_redirects=False,
+                         verify=False)
+        self.assertEqual(r.status_code, 303)
+        self.assertEqual(r.headers['Location'], failed_over_uri)
+
+    def test_standby_disable_redirect(self):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("config", "set", "mgr",
+                                                     "mgr/dashboard/standby_behaviour",
+                                                     "error")
+
+        original_active_id = self.mgr_cluster.get_active_id()
+        original_uri = self._get_uri("dashboard")
+        log.info("Originally running manager '{}' at {}".format(
+            original_active_id, original_uri))
+
+        # Force a failover and wait until the previously active manager
+        # is listed as standby.
+        self.mgr_cluster.mgr_fail(original_active_id)
+        self.wait_until_true(
+            lambda: original_active_id in self.mgr_cluster.get_standby_ids(),
+            timeout=30)
+
+        failed_active_id = self.mgr_cluster.get_active_id()
+        failed_over_uri = self._get_uri("dashboard")
+        log.info("After failover running manager '{}' at {}".format(
+            failed_active_id, failed_over_uri))
+
+        self.assertNotEqual(original_uri, failed_over_uri)
+
+        # Wait until web server of the standby node is settled.
+        self.wait_until_webserver_available(original_uri)
+
+        # Redirection should be disabled now, instead a 500 must be returned.
+        r = requests.get(original_uri, allow_redirects=False, verify=False)
+        self.assertEqual(r.status_code, 500)
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("config", "set", "mgr",
+                                                     "mgr/dashboard/standby_error_status_code",
+                                                     "503")
+
+        # The customized HTTP status code (503) must be returned.
+        r = requests.get(original_uri, allow_redirects=False, verify=False)
+        self.assertEqual(r.status_code, 503)
+
+    def test_urls(self):
+        base_uri = self._get_uri("dashboard")
+
+        # This is a very simple smoke test to check that the dashboard can
+        # give us a 200 response to requests.  We're not testing that
+        # the content is correct or even renders!
+
+        urls = [
+            "/",
+        ]
+
+        failures = []
+
+        for url in urls:
+            r = requests.get(base_uri + url, allow_redirects=False,
+                             verify=False)
+            if r.status_code >= 300 and r.status_code < 400:
+                log.error("Unexpected redirect to: {0} (from {1})".format(
+                    r.headers['Location'], base_uri))
+            if r.status_code != 200:
+                failures.append(url)
+
+            log.info("{0}: {1} ({2} bytes)".format(
+                url, r.status_code, len(r.content)
+            ))
+
+        self.assertListEqual(failures, [])
+
+    def test_tls(self):
+        class CustomHTTPAdapter(HTTPAdapter):
+            def __init__(self, ssl_version):
+                self.ssl_version = ssl_version
+                super().__init__()
+
+            def init_poolmanager(self, *args, **kwargs):
+                kwargs['ssl_version'] = self.ssl_version
+                return super().init_poolmanager(*args, **kwargs)
+
+        uri = self._get_uri("dashboard")
+
+        # TLSv1
+        with self.assertRaises(requests.exceptions.SSLError):
+            session = requests.Session()
+            session.mount(uri, CustomHTTPAdapter(ssl.PROTOCOL_TLSv1))
+            session.get(uri, allow_redirects=False, verify=False)
+
+        # TLSv1.1
+        with self.assertRaises(requests.exceptions.SSLError):
+            session = requests.Session()
+            session.mount(uri, CustomHTTPAdapter(ssl.PROTOCOL_TLSv1_1))
+            session.get(uri, allow_redirects=False, verify=False)
+
+        session = requests.Session()
+        session.mount(uri, CustomHTTPAdapter(ssl.PROTOCOL_TLS))
+        r = session.get(uri, allow_redirects=False, verify=False)
+        self.assertEqual(r.status_code, 200)
diff --git a/qa/tasks/mgr/test_failover.py b/qa/tasks/mgr/test_failover.py
new file mode 100644
index 000000000..a4e840883
--- /dev/null
+++ b/qa/tasks/mgr/test_failover.py
@@ -0,0 +1,148 @@
+
+import logging
+import json
+
+from .mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(MgrTestCase):
+    MGRS_REQUIRED = 2
+
+    def setUp(self):
+        super(TestFailover, self).setUp()
+        self.setup_mgrs()
+
+    def test_timeout(self):
+        """
+        That when an active mgr stops responding, a standby is promoted
+        after mon_mgr_beacon_grace.
+        """
+
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        # Stop that daemon
+        self.mgr_cluster.mgr_stop(original_active)
+
+        # Assert that the other mgr becomes active
+        self.wait_until_true(
+            lambda: self.mgr_cluster.get_active_id() in original_standbys,
+            timeout=60
+        )
+
+        self.mgr_cluster.mgr_restart(original_active)
+        self.wait_until_true(
+            lambda: original_active in self.mgr_cluster.get_standby_ids(),
+            timeout=10
+        )
+
+    def test_timeout_nostandby(self):
+        """
+        That when an active mgr stop responding, and no standby is
+        available, the active mgr is removed from the map anyway.
+        """
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        for s in original_standbys:
+            self.mgr_cluster.mgr_stop(s)
+            self.mgr_cluster.mgr_fail(s)
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+
+        grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
+        log.info("Should time out in about {0} seconds".format(grace))
+
+        self.mgr_cluster.mgr_stop(original_active)
+
+        # Now wait for the mon to notice the mgr is gone and remove it
+        # from the map.
+        self.wait_until_equal(
+            lambda: self.mgr_cluster.get_active_id(),
+            "",
+            timeout=grace * 2
+        )
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+        self.assertEqual(self.mgr_cluster.get_active_id(), "")
+
+    def test_explicit_fail(self):
+        """
+        That when a user explicitly fails a daemon, a standby immediately
+        replaces it.
+        :return:
+        """
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        self.mgr_cluster.mgr_fail(original_active)
+
+        # A standby should take over
+        self.wait_until_true(
+            lambda: self.mgr_cluster.get_active_id() in original_standbys,
+            timeout=60
+        )
+
+        # The one we failed should come back as a standby (he isn't
+        # really dead)
+        self.wait_until_true(
+            lambda: original_active in self.mgr_cluster.get_standby_ids(),
+            timeout=10
+        )
+
+        # Both daemons should have fully populated metadata
+        # (regression test for http://tracker.ceph.com/issues/21260)
+        meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "metadata"))
+        id_to_meta = dict([(i['name'], i) for i in meta])
+        for i in [original_active] + original_standbys:
+            self.assertIn(i, id_to_meta)
+            self.assertIn('ceph_version', id_to_meta[i])
+
+        # We should be able to fail back over again: the exercises
+        # our re-initialization of the python runtime within
+        # a single process lifetime.
+
+        # Get rid of any bystander standbys so that the original_active
+        # will be selected as next active.
+        new_active = self.mgr_cluster.get_active_id()
+        for daemon in original_standbys:
+            if daemon != new_active:
+                self.mgr_cluster.mgr_stop(daemon)
+                self.mgr_cluster.mgr_fail(daemon)
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(),
+                             [original_active])
+
+        self.mgr_cluster.mgr_stop(new_active)
+        self.mgr_cluster.mgr_fail(new_active)
+
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+        self.assertEqual(self.mgr_cluster.get_standby_ids(), [])
+
+    def test_standby_timeout(self):
+        """
+        That when a standby daemon stops sending beacons, it is
+        removed from the list of standbys
+        :return:
+        """
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        victim = original_standbys[0]
+        self.mgr_cluster.mgr_stop(victim)
+
+        expect_standbys = set(original_standbys) - {victim}
+
+        self.wait_until_true(
+            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+            timeout=60
+        )
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
diff --git a/qa/tasks/mgr/test_insights.py b/qa/tasks/mgr/test_insights.py
new file mode 100644
index 000000000..aa2548881
--- /dev/null
+++ b/qa/tasks/mgr/test_insights.py
@@ -0,0 +1,192 @@
+import logging
+import json
+import datetime
+import time
+
+from .mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+UUID = 'd5775432-0742-44a3-a435-45095e32e6b2'
+DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
+
+class TestInsights(MgrTestCase):
+    def setUp(self):
+        super(TestInsights, self).setUp()
+        self.setup_mgrs()
+        self._load_module("insights")
+        self._load_module("selftest")
+        self.crash_ids = []
+
+    def tearDown(self):
+        self._clear_crashes()
+
+    def _insights(self):
+        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd("insights")
+        return json.loads(retstr)
+
+    def _add_crash(self, hours, make_invalid = False):
+        now = datetime.datetime.utcnow()
+        timestamp = now - datetime.timedelta(hours = hours)
+        timestamp = timestamp.strftime(DATEFMT) + 'Z'
+        crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')
+        crash = {
+            'crash_id': crash_id,
+            'timestamp': timestamp,
+        }
+        if make_invalid:
+            crash["timestamp"] = "not a timestamp"
+
+        ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+            'crash', 'post', '-i', '-',
+            stdin=json.dumps(crash)
+        )
+        self.crash_ids.append(crash_id)
+        self.assertEqual(0, ret)
+
+    def _clear_crashes(self):
+        for crash_id in self.crash_ids:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                'crash', 'rm', crash_id
+            )
+
+    def _wait_for_health_history_checks(self, *args):
+        """Wait for a set of health checks to appear in the health history"""
+        timeout = datetime.datetime.utcnow() + \
+            datetime.timedelta(seconds = 15)
+        while True:
+            report = self._insights()
+            missing = False
+            for check in args:
+                if check not in report["health"]["history"]["checks"]:
+                    missing = True
+                    break
+            if not missing:
+                return
+            self.assertGreater(timeout,
+                    datetime.datetime.utcnow())
+            time.sleep(0.25)
+
+    def _wait_for_curr_health_cleared(self, check):
+        timeout = datetime.datetime.utcnow() + \
+            datetime.timedelta(seconds = 15)
+        while True:
+            report = self._insights()
+            if check not in report["health"]["current"]["checks"]:
+                return
+            self.assertGreater(timeout,
+                    datetime.datetime.utcnow())
+            time.sleep(0.25)
+
+    def test_health_history(self):
+        # use empty health history as starting point
+        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+            "insights", "prune-health", "0")
+        report = self._insights()
+        self.assertFalse(report["health"]["history"]["checks"])
+
+        # generate health check history entries. we want to avoid the edge case
+        # of running these tests at _exactly_ the top of the hour so we can
+        # explicitly control when hourly work occurs. for this we use the
+        # current time offset to a half hour.
+        now = datetime.datetime.utcnow()
+        now = datetime.datetime(
+            year = now.year,
+            month = now.month,
+            day = now.day,
+            hour = now.hour,
+            minute = 30)
+
+        check_names = set()
+        for hours in [-18, -11, -5, -1, 0]:
+            # change the insight module's perception of "now" ...
+            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                "mgr", "self-test", "insights_set_now_offset", str(hours))
+
+            # ... to simulate health check arrivals in the past
+            unique_check_name = "insights_health_check_{}".format(hours)
+            health_check = {
+                unique_check_name: {
+                    "severity": "warning",
+                    "summary": "summary",
+                    "detail": ["detail"]
+                }
+            }
+            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                "mgr", "self-test", "health", "set",
+                json.dumps(health_check))
+
+            check_names.add(unique_check_name)
+
+            # and also set the same health check to test deduplication
+            dupe_check_name = "insights_health_check"
+            health_check = {
+                dupe_check_name: {
+                    "severity": "warning",
+                    "summary": "summary",
+                    "detail": ["detail"]
+                }
+            }
+            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                "mgr", "self-test", "health", "set",
+                json.dumps(health_check))
+
+            check_names.add(dupe_check_name)
+
+            # wait for the health check to show up in the history report
+            self._wait_for_health_history_checks(unique_check_name, dupe_check_name)
+
+            # clear out the current health checks before moving on
+            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+                "mgr", "self-test", "health", "clear")
+            self._wait_for_curr_health_cleared(unique_check_name)
+
+        report = self._insights()
+        for check in check_names:
+            self.assertIn(check, report["health"]["history"]["checks"])
+
+        # restart the manager
+        active_id = self.mgr_cluster.get_active_id()
+        self.mgr_cluster.mgr_restart(active_id)
+
+        # pruning really removes history
+        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+            "insights", "prune-health", "0")
+        report = self._insights()
+        self.assertFalse(report["health"]["history"]["checks"])
+
+    def test_schema(self):
+        """TODO: assert conformance to a full schema specification?"""
+        report = self._insights()
+        for key in ["osd_metadata",
+                    "pg_summary",
+                    "mon_status",
+                    "manager_map",
+                    "service_map",
+                    "mon_map",
+                    "crush_map",
+                    "fs_map",
+                    "osd_tree",
+                    "df",
+                    "osd_dump",
+                    "config",
+                    "health",
+                    "crashes",
+                    "version",
+                    "errors"]:
+            self.assertIn(key, report)
+
+    def test_crash_history(self):
+        self._clear_crashes()
+        report = self._insights()
+        self.assertFalse(report["crashes"]["summary"])
+        self.assertFalse(report["errors"])
+
+        # crashes show up in the report
+        self._add_crash(1)
+        report = self._insights()
+        self.assertTrue(report["crashes"]["summary"])
+        self.assertFalse(report["errors"])
+        log.warning("{}".format(json.dumps(report["crashes"], indent=2)))
+
+        self._clear_crashes()
diff --git a/qa/tasks/mgr/test_module_selftest.py b/qa/tasks/mgr/test_module_selftest.py
new file mode 100644
index 000000000..b054642db
--- /dev/null
+++ b/qa/tasks/mgr/test_module_selftest.py
@@ -0,0 +1,257 @@
+
+import time
+import requests
+import errno
+import logging
+
+from teuthology.exceptions import CommandFailedError
+
+from .mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestModuleSelftest(MgrTestCase):
+    """
+    That modules with a self-test command can be loaded and execute it
+    without errors.
+
+    This is not a substitute for really testing the modules, but it
+    is quick and is designed to catch regressions that could occur
+    if data structures change in a way that breaks how the modules
+    touch them.
+    """
+    MGRS_REQUIRED = 1
+
+    def setUp(self):
+        super(TestModuleSelftest, self).setUp()
+        self.setup_mgrs()
+
+    def _selftest_plugin(self, module_name):
+        self._load_module("selftest")
+        self._load_module(module_name)
+
+        # Execute the module's self_test() method
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "mgr", "self-test", "module", module_name)
+
+    def test_zabbix(self):
+        # Set these mandatory config fields so that the zabbix module
+        # won't trigger health/log errors on load/serve.
+        self.mgr_cluster.set_module_conf("zabbix", "zabbix_host", "localhost")
+        self.mgr_cluster.set_module_conf("zabbix", "identifier", "foo")
+        self._selftest_plugin("zabbix")
+
+    def test_prometheus(self):
+        self._assign_ports("prometheus", "server_port", min_port=8100)
+        self._selftest_plugin("prometheus")
+
+    def test_influx(self):
+        self._selftest_plugin("influx")
+
+    def test_diskprediction_local(self):
+        self._load_module("selftest")
+        python_version = self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "self-test", "python-version")
+        if tuple(int(v) for v in python_version.split('.')) >= (3, 8):
+            # https://tracker.ceph.com/issues/45147
+            self.skipTest(f'python {python_version} not compatible with '
+                          'diskprediction_local')
+        self._selftest_plugin("diskprediction_local")
+
+    def test_telegraf(self):
+        self._selftest_plugin("telegraf")
+
+    def test_iostat(self):
+        self._selftest_plugin("iostat")
+
+    def test_devicehealth(self):
+        self._selftest_plugin("devicehealth")
+        # Clean up the pool that the module creates, because otherwise
+        # it's low PG count causes test failures.
+        pool_name = "device_health_metrics"
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "osd", "pool", "delete", pool_name, pool_name,
+                "--yes-i-really-really-mean-it")
+
+    def test_selftest_run(self):
+        self._load_module("selftest")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+    def test_telemetry(self):
+        self._selftest_plugin("telemetry")
+
+    def test_crash(self):
+        self._selftest_plugin("crash")
+
+    def test_orchestrator(self):
+        self._selftest_plugin("orchestrator")
+
+
+    def test_selftest_config_update(self):
+        """
+        That configuration updates are seen by running mgr modules
+        """
+        self._load_module("selftest")
+
+        def get_value():
+            return self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "mgr", "self-test", "config", "get", "testkey").strip()
+
+        self.assertEqual(get_value(), "None")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "config", "set", "mgr", "mgr/selftest/testkey", "foo")
+        self.wait_until_equal(get_value, "foo", timeout=10)
+
+        def get_localized_value():
+            return self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "mgr", "self-test", "config", "get_localized", "testkey").strip()
+
+        self.assertEqual(get_localized_value(), "foo")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "config", "set", "mgr", "mgr/selftest/{}/testkey".format(
+                self.mgr_cluster.get_active_id()),
+            "bar")
+        self.wait_until_equal(get_localized_value, "bar", timeout=10)
+
+
+    def test_selftest_command_spam(self):
+        # Use the selftest module to stress the mgr daemon
+        self._load_module("selftest")
+
+        # Use the dashboard to test that the mgr is still able to do its job
+        self._assign_ports("dashboard", "ssl_server_port")
+        self._load_module("dashboard")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("dashboard",
+                                                     "create-self-signed-cert")
+
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "start",
+                                                     "command_spam")
+
+        dashboard_uri = self._get_uri("dashboard")
+
+        delay = 10
+        periods = 10
+        for i in range(0, periods):
+            t1 = time.time()
+            # Check that an HTTP module remains responsive
+            r = requests.get(dashboard_uri, verify=False)
+            self.assertEqual(r.status_code, 200)
+
+            # Check that a native non-module command remains responsive
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+            time.sleep(delay - (time.time() - t1))
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "stop")
+
+        # Check that all mgr daemons are still running
+        self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+        self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())
+
+    def test_module_commands(self):
+        """
+        That module-handled commands have appropriate  behavior on
+        disabled/failed/recently-enabled modules.
+        """
+
+        # Calling a command on a disabled module should return the proper
+        # error code.
+        self._load_module("selftest")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "module", "disable", "selftest")
+        with self.assertRaises(CommandFailedError) as exc_raised:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "mgr", "self-test", "run")
+
+        self.assertEqual(exc_raised.exception.exitstatus, errno.EOPNOTSUPP)
+
+        # Calling a command that really doesn't exist should give me EINVAL.
+        with self.assertRaises(CommandFailedError) as exc_raised:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "osd", "albatross")
+
+        self.assertEqual(exc_raised.exception.exitstatus, errno.EINVAL)
+
+        # Enabling a module and then immediately using ones of its commands
+        # should work (#21683)
+        self._load_module("selftest")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "self-test", "config", "get", "testkey")
+
+        # Calling a command for a failed module should return the proper
+        # error code.
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "self-test", "background", "start", "throw_exception")
+        with self.assertRaises(CommandFailedError) as exc_raised:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "mgr", "self-test", "run"
+            )
+        self.assertEqual(exc_raised.exception.exitstatus, errno.EIO)
+
+        # A health alert should be raised for a module that has thrown
+        # an exception from its serve() method
+        self.wait_for_health(
+            "Module 'selftest' has failed: Synthetic exception in serve",
+            timeout=30)
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "module", "disable", "selftest")
+
+        self.wait_for_health_clear(timeout=30)
+
+    def test_module_remote(self):
+        """
+        Use the selftest module to exercise inter-module communication
+        """
+        self._load_module("selftest")
+        # The "self-test remote" operation just happens to call into
+        # influx.
+        self._load_module("influx")
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "self-test", "remote")
+
+    def test_selftest_cluster_log(self):
+        """
+        Use the selftest module to test the cluster/audit log interface.
+        """
+        priority_map = {
+            "info": "INF",
+            "security": "SEC",
+            "warning": "WRN",
+            "error": "ERR"
+        }
+        self._load_module("selftest")
+        for priority in priority_map.keys():
+            message = "foo bar {}".format(priority)
+            log_message = "[{}] {}".format(priority_map[priority], message)
+            # Check for cluster/audit logs:
+            # 2018-09-24 09:37:10.977858 mgr.x [INF] foo bar info
+            # 2018-09-24 09:37:10.977860 mgr.x [SEC] foo bar security
+            # 2018-09-24 09:37:10.977863 mgr.x [WRN] foo bar warning
+            # 2018-09-24 09:37:10.977866 mgr.x [ERR] foo bar error
+            with self.assert_cluster_log(log_message):
+                self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    "mgr", "self-test", "cluster-log", "cluster",
+                    priority, message)
+            with self.assert_cluster_log(log_message, watch_channel="audit"):
+                self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    "mgr", "self-test", "cluster-log", "audit",
+                    priority, message)
+
+    def test_selftest_cluster_log_unknown_channel(self):
+        """
+        Use the selftest module to test the cluster/audit log interface.
+        """
+        with self.assertRaises(CommandFailedError) as exc_raised:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                "mgr", "self-test", "cluster-log", "xyz",
+                "ERR", "The channel does not exist")
+        self.assertEqual(exc_raised.exception.exitstatus, errno.EOPNOTSUPP)
diff --git a/qa/tasks/mgr/test_orchestrator_cli.py b/qa/tasks/mgr/test_orchestrator_cli.py
new file mode 100644
index 000000000..3fccef9a6
--- /dev/null
+++ b/qa/tasks/mgr/test_orchestrator_cli.py
@@ -0,0 +1,250 @@
+import errno
+import json
+import logging
+
+
+from .mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestOrchestratorCli(MgrTestCase):
+    MGRS_REQUIRED = 1
+
+    def _cmd(self, module, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd(module, *args)
+
+    def _orch_cmd(self, *args):
+        return self._cmd("orch", *args)
+
+    def _progress_cmd(self, *args):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", *args)
+
+    def _orch_cmd_result(self, *args, **kwargs):
+        """
+        raw_cluster_cmd doesn't support kwargs.
+        """
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd_result("orch", *args, **kwargs)
+
+    def _test_orchestrator_cmd_result(self, *args, **kwargs):
+        return self.mgr_cluster.mon_manager.raw_cluster_cmd_result("test_orchestrator", *args, **kwargs)
+
+    def setUp(self):
+        super(TestOrchestratorCli, self).setUp()
+
+        self._load_module("orchestrator")
+        self._load_module("test_orchestrator")
+        self._orch_cmd("set", "backend", "test_orchestrator")
+
+    def test_status(self):
+        ret = self._orch_cmd("status")
+        self.assertIn("test_orchestrator", ret)
+
+    def test_device_ls(self):
+        ret = self._orch_cmd("device", "ls")
+        self.assertIn("localhost", ret)
+
+    def test_device_ls_refresh(self):
+        ret = self._orch_cmd("device", "ls", "--refresh")
+        self.assertIn("localhost", ret)
+
+    def test_device_ls_hoshs(self):
+        ret = self._orch_cmd("device", "ls", "localhost", "host1")
+        self.assertIn("localhost", ret)
+
+
+    def test_device_ls_json(self):
+        ret = self._orch_cmd("device", "ls", "--format", "json")
+        self.assertIn("localhost", ret)
+        self.assertIsInstance(json.loads(ret), list)
+
+    def test_ps(self):
+        ret = self._orch_cmd("ps")
+        self.assertIn("mgr", ret)
+
+    def test_ps_json(self):
+        ret = self._orch_cmd("ps", "--format", "json")
+        self.assertIsInstance(json.loads(ret), list)
+        self.assertIn("mgr", ret)
+
+
+    def test_service_action(self):
+        self._orch_cmd("restart", "mds.cephfs")
+        self._orch_cmd("stop", "mds.cephfs")
+        self._orch_cmd("start", "mds.cephfs")
+
+    def test_service_instance_action(self):
+        self._orch_cmd("daemon", "restart", "mds.a")
+        self._orch_cmd("daemon", "stop", "mds.a")
+        self._orch_cmd("daemon", "start", "mds.a")
+
+    def test_osd_create(self):
+        drive_group = """
+service_type: osd
+service_id: any.sda
+placement:
+  host_pattern: '*'
+data_devices:
+  all: True
+"""
+        res = self._orch_cmd_result("apply", "osd", "-i", "-",
+                                    stdin=drive_group)
+        self.assertEqual(res, 0)
+
+    def test_blink_device_light(self):
+        def _ls_lights(what):
+            return json.loads(self._cmd("device", "ls-lights"))[what]
+
+        metadata = json.loads(self._cmd("osd", "metadata"))
+        dev_name_ids = [osd["device_ids"] for osd in metadata]
+        _, dev_id = [d.split('=') for d in dev_name_ids if len(d.split('=')) == 2][0]
+
+        for t in ["ident", "fault"]:
+            self.assertNotIn(dev_id, _ls_lights(t))
+            self._cmd("device", "light", "on", dev_id, t)
+            self.assertIn(dev_id, _ls_lights(t))
+
+            health = {
+                'ident': 'DEVICE_IDENT_ON',
+                'fault': 'DEVICE_FAULT_ON',
+            }[t]
+            self.wait_for_health(health, 30)
+
+            self._cmd("device", "light", "off", dev_id, t)
+            self.assertNotIn(dev_id, _ls_lights(t))
+
+        self.wait_for_health_clear(30)
+
+    def test_mds_add(self):
+        self._orch_cmd('daemon', 'add', 'mds', 'fsname')
+
+    def test_rgw_add(self):
+        self._orch_cmd('daemon', 'add', 'rgw', 'realm', 'zone')
+
+    def test_nfs_add(self):
+        self._orch_cmd('daemon', 'add', "nfs", "service_name")
+
+    def test_osd_rm(self):
+        self._orch_cmd('daemon', "rm", "osd.0", '--force')
+
+    def test_mds_rm(self):
+        self._orch_cmd("daemon", "rm", "mds.fsname")
+
+    def test_rgw_rm(self):
+        self._orch_cmd("daemon", "rm", "rgw.myrealm.myzone")
+
+    def test_nfs_rm(self):
+        self._orch_cmd("daemon", "rm", "nfs.service_name")
+
+    def test_host_ls(self):
+        out = self._orch_cmd("host", "ls", "--format=json")
+        hosts = json.loads(out)
+        self.assertEqual(len(hosts), 1)
+        self.assertEqual(hosts[0]["hostname"], "localhost")
+
+    def test_host_add(self):
+        self._orch_cmd("host", "add", "hostname")
+
+    def test_host_rm(self):
+        self._orch_cmd("host", "rm", "hostname")
+
+    def test_mon_update(self):
+        self._orch_cmd("apply", "mon", "3 host1:1.2.3.0/24 host2:1.2.3.0/24 host3:10.0.0.0/8")
+        self._orch_cmd("apply", "mon", "3 host1:1.2.3.4 host2:1.2.3.4 host3:10.0.0.1")
+
+    def test_mgr_update(self):
+        self._orch_cmd("apply", "mgr", "3")
+
+    def test_nfs_update(self):
+        self._orch_cmd("apply", "nfs", "service_name", "2")
+
+    def test_error(self):
+        ret = self._orch_cmd_result("host", "add", "raise_validation_error")
+        self.assertEqual(ret, errno.EINVAL)
+        ret = self._orch_cmd_result("host", "add", "raise_error")
+        self.assertEqual(ret, errno.EINVAL)
+        ret = self._orch_cmd_result("host", "add", "raise_bug")
+        self.assertEqual(ret, errno.EINVAL)
+        ret = self._orch_cmd_result("host", "add", "raise_not_implemented")
+        self.assertEqual(ret, errno.ENOENT)
+        ret = self._orch_cmd_result("host", "add", "raise_no_orchestrator")
+        self.assertEqual(ret, errno.ENOENT)
+        ret = self._orch_cmd_result("host", "add", "raise_import_error")
+        self.assertEqual(ret, errno.ENOENT)
+
+    def test_load_data(self):
+        data = {
+            'inventory': [
+                {
+                    'name': 'host0',
+                    'devices': [
+                        {
+                            'type': 'hdd',
+                            'id': '/dev/sda',
+                            'size': 1024**4 * 4,
+                            'rotates': True
+                        }
+                    ]
+                },
+                {
+                    'name': 'host1',
+                    'devices': [
+                        {
+                            'type': 'hdd',
+                            'id': '/dev/sda',
+                            'size': 1024**4 * 4,
+                            'rotates': True
+                        }
+                    ]
+                }
+            ],
+            'daemons': [
+                {
+                    'hostname': 'host0',
+                    'daemon_type': 'mon',
+                    'daemon_id': 'a'
+                },
+                {
+                    'hostname': 'host1',
+                    'daemon_type': 'osd',
+                    'daemon_id': '1'
+                }
+            ]
+        }
+
+        ret = self._test_orchestrator_cmd_result('load_data', '-i', '-', stdin=json.dumps(data))
+        self.assertEqual(ret, 0)
+        out = self._orch_cmd('device', 'ls', '--format=json')
+        inventory = data['inventory']
+        inventory_result = json.loads(out)
+        self.assertEqual(len(inventory), len(inventory_result))
+
+        out = self._orch_cmd('device', 'ls', 'host0', '--format=json')
+        inventory_result = json.loads(out)
+        self.assertEqual(len(inventory_result), 1)
+        self.assertEqual(inventory_result[0]['name'], 'host0')
+
+        out = self._orch_cmd('ps', '--format=json')
+        daemons = data['daemons']
+        daemons_result = json.loads(out)
+        self.assertEqual(len(daemons), len(daemons_result))
+
+        out = self._orch_cmd('ps', 'host0', '--format=json')
+        daemons_result = json.loads(out)
+        self.assertEqual(len(daemons_result), 1)
+        self.assertEqual(daemons_result[0]['hostname'], 'host0')
+
+        # test invalid input file: invalid json
+        json_str = '{ "inventory: '
+        ret = self._test_orchestrator_cmd_result('load_data', '-i', '-', stdin=json_str)
+        self.assertEqual(ret, errno.EINVAL)
+
+        # test invalid input file: missing key
+        json_str = '{ "inventory": [{"devices": []}] }'
+        ret = self._test_orchestrator_cmd_result('load_data', '-i', '-', stdin=json_str)
+        self.assertEqual(ret, errno.EINVAL)
+
+        # load empty data for other tests
+        ret = self._test_orchestrator_cmd_result('load_data', '-i', '-', stdin='{}')
+        self.assertEqual(ret, 0)
diff --git a/qa/tasks/mgr/test_progress.py b/qa/tasks/mgr/test_progress.py
new file mode 100644
index 000000000..082653f62
--- /dev/null
+++ b/qa/tasks/mgr/test_progress.py
@@ -0,0 +1,380 @@
+
+import json
+import logging
+import time
+from .mgr_test_case import MgrTestCase
+from contextlib import contextmanager
+
+log = logging.getLogger(__name__)
+
+
+class TestProgress(MgrTestCase):
+    POOL = "progress_data"
+
+    # How long we expect to wait at most between taking an OSD out
+    # and seeing the progress event pop up.
+    EVENT_CREATION_PERIOD = 60
+
+    WRITE_PERIOD = 30
+
+    # Generous period for OSD recovery, should be same order of magnitude
+    # to how long it took to write the data to begin with
+    RECOVERY_PERIOD = WRITE_PERIOD * 4
+
+    def _get_progress(self):
+        out = self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "json")
+        return json.loads(out)
+
+    def _all_events(self):
+        """
+        To avoid racing on completion, we almost always want to look
+        for events in the total list of active and complete, so
+        munge them into a single list.
+        """
+        p = self._get_progress()
+        log.info(json.dumps(p, indent=2))
+        return p['events'] + p['completed']
+
+    def _events_in_progress(self):
+        """
+        this function returns all events that are in progress
+        """
+        p = self._get_progress()
+        log.info(json.dumps(p, indent=2))
+        return p['events']
+
+    def _completed_events(self):
+        """
+        This function returns all events that are completed
+        """
+        p = self._get_progress()
+        log.info(json.dumps(p, indent=2))
+        return p['completed']
+
+    def is_osd_marked_out(self, ev):
+        return ev['message'].endswith('marked out')
+
+    def is_osd_marked_in(self, ev):
+        return ev['message'].endswith('marked in')
+
+    def _get_osd_in_out_events(self, marked='both'):
+        """
+        Return the event that deals with OSDs being
+        marked in, out or both
+        """
+
+        marked_in_events = []
+        marked_out_events = []
+
+        events_in_progress = self._events_in_progress()
+        for ev in events_in_progress:
+            if self.is_osd_marked_out(ev):
+                marked_out_events.append(ev)
+            elif self.is_osd_marked_in(ev):
+                marked_in_events.append(ev)
+
+        if marked == 'both':
+            return [marked_in_events] + [marked_out_events]
+        elif marked == 'in':
+            return marked_in_events
+        else:
+            return marked_out_events
+
+    def _osd_in_out_events_count(self, marked='both'):
+        """
+        Count the number of on going recovery events that deals with
+        OSDs being marked in, out or both.
+        """
+        events_in_progress = self._events_in_progress()
+        marked_in_count = 0
+        marked_out_count = 0
+
+        for ev in events_in_progress:
+            if self.is_osd_marked_out(ev):
+                marked_out_count += 1
+            elif self.is_osd_marked_in(ev):
+                marked_in_count += 1
+
+        if marked == 'both':
+            return marked_in_count + marked_out_count
+        elif marked == 'in':
+            return marked_in_count
+        else:
+            return marked_out_count
+
+    def _setup_pool(self, size=None):
+        self.mgr_cluster.mon_manager.create_pool(self.POOL)
+        if size is not None:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                'osd', 'pool', 'set', self.POOL, 'size', str(size))
+
+    def _osd_in_out_completed_events_count(self, marked='both'):
+        """
+        Count the number of completed recovery events that deals with
+        OSDs being marked in, out, or both.
+        """
+
+        completed_events = self._completed_events()
+        marked_in_count = 0
+        marked_out_count = 0
+
+        for ev in completed_events:
+            if self.is_osd_marked_out(ev):
+                marked_out_count += 1
+            elif self.is_osd_marked_in(ev):
+                marked_in_count += 1
+
+        if marked == 'both':
+            return marked_in_count + marked_out_count
+        elif marked == 'in':
+            return marked_in_count
+        else:
+            return marked_out_count
+
+    def _write_some_data(self, t):
+        """
+        To adapt to test systems of varying performance, we write
+        data for a defined time period, rather than to a defined
+        capacity.  This will hopefully result in a similar timescale
+        for PG recovery after an OSD failure.
+        """
+
+        args = [
+            "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"]
+
+        self.mgr_cluster.admin_remote.run(args=args, wait=True)
+
+    def _osd_count(self):
+        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
+        return len(osd_map['osds'])
+
+    @contextmanager    
+    def recovery_backfill_disabled(self):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'set', 'nobackfill')
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'set', 'norecover')
+        yield
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'unset', 'nobackfill')
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'unset', 'norecover')
+           
+    def setUp(self):
+        super(TestProgress, self).setUp()
+        # Ensure we have at least four OSDs
+        if self._osd_count() < 4:
+            self.skipTest("Not enough OSDS!")
+
+        # Remove any filesystems so that we can remove their pools
+        if self.mds_cluster:
+            self.mds_cluster.mds_stop()
+            self.mds_cluster.mds_fail()
+            self.mds_cluster.delete_all_filesystems()
+
+        # Remove all other pools
+        for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']:
+            self.mgr_cluster.mon_manager.remove_pool(pool['pool_name'])
+
+        self._load_module("progress")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd('progress', 'clear')
+
+    def _simulate_failure(self, osd_ids=None):
+        """
+        Common lead-in to several tests: get some data in the cluster,
+        then mark an OSD out to trigger the start of a progress event.
+
+        Return the JSON representation of the failure event.
+        """
+
+        if osd_ids is None:
+            osd_ids = [0]
+
+        self._setup_pool()
+        self._write_some_data(self.WRITE_PERIOD)
+        with self.recovery_backfill_disabled():
+            for osd_id in osd_ids:
+                self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'out', str(osd_id))
+
+            # Wait for a progress event to pop up
+            self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
+                                  timeout=self.EVENT_CREATION_PERIOD,
+                                  period=1)
+
+        ev = self._get_osd_in_out_events('out')[0]
+        log.info(json.dumps(ev, indent=1))
+        self.assertIn("Rebalancing after osd.0 marked out", ev['message'])
+        return ev
+
+    def _simulate_back_in(self, osd_ids, initial_event):
+        for osd_id in osd_ids:
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'in', str(osd_id))
+
+        # First Event should complete promptly
+        self.wait_until_true(lambda: self._is_complete(initial_event['id']),
+                             timeout=self.RECOVERY_PERIOD)
+
+        with self.recovery_backfill_disabled():
+
+            try:
+                # Wait for progress event marked in to pop up
+                self.wait_until_equal(lambda: self._osd_in_out_events_count('in'), 1,
+                                      timeout=self.EVENT_CREATION_PERIOD,
+                                      period=1)
+            except RuntimeError as ex:
+                if not "Timed out after" in str(ex):
+                    raise ex
+
+                log.info("There was no PGs affected by osd being marked in")
+                return None
+
+            new_event = self._get_osd_in_out_events('in')[0]
+        return new_event
+
+    def _no_events_anywhere(self):
+        """
+        Whether there are any live or completed events
+        """
+        p = self._get_progress()
+        total_events = len(p['events']) + len(p['completed'])
+        return total_events == 0
+
+    def _is_quiet(self):
+        """
+        Whether any progress events are live.
+        """
+        return len(self._get_progress()['events']) == 0
+
+    def _is_complete(self, ev_id):
+        progress = self._get_progress()
+        live_ids = [ev['id'] for ev in progress['events']]
+        complete_ids = [ev['id'] for ev in progress['completed']]
+        if ev_id in complete_ids:
+            assert ev_id not in live_ids
+            return True
+        else:
+            assert ev_id in live_ids
+            return False
+
+    def _is_inprogress_or_complete(self, ev_id):
+        for ev in self._events_in_progress():
+            if ev['id'] == ev_id:
+                return ev['progress'] > 0
+        # check if the event completed
+        return self._is_complete(ev_id)
+
+    def tearDown(self):
+        if self.POOL in self.mgr_cluster.mon_manager.pools:
+            self.mgr_cluster.mon_manager.remove_pool(self.POOL)
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'unset', 'nobackfill')
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'unset', 'norecover')
+
+        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
+        for osd in osd_map['osds']:
+            if osd['weight'] == 0.0:
+                self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'in', str(osd['osd']))
+
+        super(TestProgress, self).tearDown()
+
+    def test_osd_healthy_recovery(self):
+        """
+        The simple recovery case: an OSD goes down, its PGs get a new
+        placement, and we wait for the PG to get healthy in its new
+        locations.
+        """
+        ev = self._simulate_failure()
+
+        # Wait for progress event to ultimately reach completion
+        self.wait_until_true(lambda: self._is_complete(ev['id']),
+                             timeout=self.RECOVERY_PERIOD)
+        self.assertEqual(self._osd_in_out_events_count(), 0)
+
+    def test_pool_removal(self):
+        """
+        That a pool removed during OSD recovery causes the
+        progress event to be correctly marked complete once there
+        is no more data to move.
+        """
+        ev = self._simulate_failure()
+
+        self.mgr_cluster.mon_manager.remove_pool(self.POOL)
+
+        # Event should complete promptly
+        self.wait_until_true(lambda: self._is_complete(ev['id']),
+                             timeout=self.RECOVERY_PERIOD)
+        self.assertEqual(self._osd_in_out_events_count(), 0)
+
+    def test_osd_came_back(self):
+        """
+        When a recovery is underway, but then the out OSD
+        comes back in, such that recovery is no longer necessary.
+        It should create another event for when osd is marked in
+        and cancel the one that is still ongoing.
+        """
+        ev1 = self._simulate_failure()
+
+        ev2 = self._simulate_back_in([0], ev1)
+
+        if ev2 is not None:
+            # Wait for progress event to ultimately complete
+            self.wait_until_true(lambda: self._is_complete(ev2['id']),
+                                 timeout=self.RECOVERY_PERIOD)
+
+        self.assertEqual(self._osd_in_out_events_count(), 0)
+
+    def test_turn_off_module(self):
+        """
+        When the the module is turned off, there should not
+        be any on going events or completed events.
+        Also module should not accept any kind of Remote Event
+        coming in from other module, however, once it is turned
+        back, on creating an event should be working as it is.
+        """
+
+        pool_size = 3
+        self._setup_pool(size=pool_size)
+        self._write_some_data(self.WRITE_PERIOD)
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "off")
+
+        with self.recovery_backfill_disabled():
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'out', '0')
+
+        time.sleep(self.EVENT_CREATION_PERIOD/2)
+
+        with self.recovery_backfill_disabled():
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'in', '0')
+
+        time.sleep(self.EVENT_CREATION_PERIOD/2)
+
+        self.assertTrue(self._no_events_anywhere())
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "on")
+
+        self._write_some_data(self.WRITE_PERIOD)
+
+        with self.recovery_backfill_disabled():
+
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'out', '0')
+
+            # Wait for a progress event to pop up
+            self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
+                                  timeout=self.EVENT_CREATION_PERIOD,
+                                  period=1)
+
+        ev1 = self._get_osd_in_out_events('out')[0]
+
+        log.info(json.dumps(ev1, indent=1))
+
+        self.wait_until_true(lambda: self._is_complete(ev1['id']),
+                             check_fn=lambda: self._is_inprogress_or_complete(ev1['id']),
+                             timeout=self.RECOVERY_PERIOD)
+        self.assertTrue(self._is_quiet())
diff --git a/qa/tasks/mgr/test_prometheus.py b/qa/tasks/mgr/test_prometheus.py
new file mode 100644
index 000000000..376556ab3
--- /dev/null
+++ b/qa/tasks/mgr/test_prometheus.py
@@ -0,0 +1,79 @@
+import json
+import logging
+import requests
+
+from .mgr_test_case import MgrTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestPrometheus(MgrTestCase):
+    MGRS_REQUIRED = 3
+
+    def setUp(self):
+        super(TestPrometheus, self).setUp()
+        self.setup_mgrs()
+
+    def test_file_sd_command(self):
+        self._assign_ports("prometheus", "server_port")
+        self._load_module("prometheus")
+
+        result = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "prometheus", "file_sd_config"))
+        mgr_map = self.mgr_cluster.get_mgr_map()
+        self.assertEqual(len(result[0]['targets']), len(mgr_map['standbys']) + 1)
+
+
+
+    def test_standby(self):
+        self._assign_ports("prometheus", "server_port")
+        self._load_module("prometheus")
+
+        original_active = self.mgr_cluster.get_active_id()
+
+        original_uri = self._get_uri("prometheus")
+        log.info("Originally running at {0}".format(original_uri))
+
+        self.mgr_cluster.mgr_fail(original_active)
+
+        failed_over_uri = self._get_uri("prometheus")
+        log.info("After failover running at {0}".format(failed_over_uri))
+
+        self.assertNotEqual(original_uri, failed_over_uri)
+
+        # The original active daemon should have come back up as a standby
+        # and serve some html under "/" and an empty answer under /metrics
+        r = requests.get(original_uri, allow_redirects=False)
+        self.assertEqual(r.status_code, 200)
+        r = requests.get(original_uri + "metrics", allow_redirects=False)
+        self.assertEqual(r.status_code, 200)
+        self.assertEqual(r.headers["content-type"], "text/plain;charset=utf-8")
+        self.assertEqual(r.headers["server"], "Ceph-Prometheus")
+
+    def test_urls(self):
+        self._assign_ports("prometheus", "server_port")
+        self._load_module("prometheus")
+
+        base_uri = self._get_uri("prometheus")
+
+        # This is a very simple smoke test to check that the module can
+        # give us a 200 response to requests.  We're not testing that
+        # the content is correct or even renders!
+
+        urls = [
+            "/",
+            "/metrics"
+        ]
+
+        failures = []
+
+        for url in urls:
+            r = requests.get(base_uri + url, allow_redirects=False)
+            if r.status_code != 200:
+                failures.append(url)
+
+            log.info("{0}: {1} ({2} bytes)".format(
+                url, r.status_code, len(r.content)
+            ))
+
+        self.assertListEqual(failures, [])
diff --git a/qa/tasks/mon_clock_skew_check.py b/qa/tasks/mon_clock_skew_check.py
new file mode 100644
index 000000000..59d4169d1
--- /dev/null
+++ b/qa/tasks/mon_clock_skew_check.py
@@ -0,0 +1,73 @@
+"""
+Handle clock skews in monitors.
+"""
+import logging
+import time
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+class ClockSkewCheck:
+    """
+    Check if there are any clock skews among the monitors in the
+    quorum.
+
+    This task accepts the following options:
+
+    interval     amount of seconds to wait before check. (default: 30.0)
+    expect-skew  'true' or 'false', to indicate whether to expect a skew during
+                 the run or not. If 'true', the test will fail if no skew is
+                 found, and succeed if a skew is indeed found; if 'false', it's
+                 the other way around. (default: false)
+
+    - mon_clock_skew_check:
+        expect-skew: true
+    """
+
+    def __init__(self, ctx, manager, config, logger):
+        self.ctx = ctx
+        self.manager = manager
+
+        self.stopping = False
+        self.logger = logger
+        self.config = config
+
+        if self.config is None:
+            self.config = dict()
+
+
+def task(ctx, config):
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'mon_clock_skew_check task only accepts a dict for configuration'
+    interval = float(config.get('interval', 30.0))
+    expect_skew = config.get('expect-skew', False)
+
+    log.info('Beginning mon_clock_skew_check...')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    quorum_size = len(teuthology.get_mon_names(ctx))
+    manager.wait_for_mon_quorum_size(quorum_size)
+
+    # wait a bit
+    log.info('sleeping for {s} seconds'.format(
+        s=interval))
+    time.sleep(interval)
+
+    health = manager.get_mon_health(True)
+    log.info('got health %s' % health)
+    if expect_skew:
+        if 'MON_CLOCK_SKEW' not in health['checks']:
+            raise RuntimeError('expected MON_CLOCK_SKEW but got none')
+    else:
+        if 'MON_CLOCK_SKEW' in health['checks']:
+            raise RuntimeError('got MON_CLOCK_SKEW but expected none')
+
diff --git a/qa/tasks/mon_recovery.py b/qa/tasks/mon_recovery.py
new file mode 100644
index 000000000..fa7aa1a8d
--- /dev/null
+++ b/qa/tasks/mon_recovery.py
@@ -0,0 +1,80 @@
+"""
+Monitor recovery
+"""
+import logging
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test monitor recovery.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
+    log.info("mon ids = %s" % mons)
+
+    manager.wait_for_mon_quorum_size(len(mons))
+
+    log.info('verifying all monitors are in the quorum')
+    for m in mons:
+        s = manager.get_mon_status(m)
+        assert s['state'] == 'leader' or s['state'] == 'peon'
+        assert len(s['quorum']) == len(mons)
+
+    log.info('restarting each monitor in turn')
+    for m in mons:
+        # stop a monitor
+        manager.kill_mon(m)
+        manager.wait_for_mon_quorum_size(len(mons) - 1)
+
+        # restart
+        manager.revive_mon(m)
+        manager.wait_for_mon_quorum_size(len(mons))
+
+    # in forward and reverse order,
+    rmons = mons
+    rmons.reverse()
+    for mons in mons, rmons:
+        log.info('stopping all monitors')
+        for m in mons:
+            manager.kill_mon(m)
+
+        log.info('forming a minimal quorum for %s, then adding monitors' % mons)
+        qnum = (len(mons) // 2) + 1
+        num = 0
+        for m in mons:
+            manager.revive_mon(m)
+            num += 1
+            if num >= qnum:
+                manager.wait_for_mon_quorum_size(num)
+
+    # on both leader and non-leader ranks...
+    for rank in [0, 1]:
+        # take one out
+        log.info('removing mon %s' % mons[rank])
+        manager.kill_mon(mons[rank])
+        manager.wait_for_mon_quorum_size(len(mons) - 1)
+
+        log.info('causing some monitor log activity')
+        m = 30
+        for n in range(1, m):
+            manager.raw_cluster_cmd('log', '%d of %d' % (n, m))
+
+        log.info('adding mon %s back in' % mons[rank])
+        manager.revive_mon(mons[rank])
+        manager.wait_for_mon_quorum_size(len(mons))
diff --git a/qa/tasks/mon_thrash.py b/qa/tasks/mon_thrash.py
new file mode 100644
index 000000000..4224acf03
--- /dev/null
+++ b/qa/tasks/mon_thrash.py
@@ -0,0 +1,386 @@
+"""
+Monitor thrash
+"""
+import logging
+import contextlib
+import random
+import time
+import gevent
+import json
+import math
+from teuthology import misc as teuthology
+from tasks import ceph_manager
+from tasks.cephfs.filesystem import MDSCluster
+from tasks.thrasher import Thrasher
+
+log = logging.getLogger(__name__)
+
+def _get_mons(ctx):
+    """
+    Get monitor names from the context value.
+    """
+    mons = [f[len('mon.'):] for f in teuthology.get_mon_names(ctx)]
+    return mons
+
+class MonitorThrasher(Thrasher):
+    """
+    How it works::
+
+    - pick a monitor
+    - kill it
+    - wait for quorum to be formed
+    - sleep for 'revive_delay' seconds
+    - revive monitor
+    - wait for quorum to be formed
+    - sleep for 'thrash_delay' seconds
+
+    Options::
+
+    seed                Seed to use on the RNG to reproduce a previous
+                        behaviour (default: None; i.e., not set)
+    revive_delay        Number of seconds to wait before reviving
+                        the monitor (default: 10)
+    thrash_delay        Number of seconds to wait in-between
+                        test iterations (default: 0)
+    store_thrash        Thrash monitor store before killing the monitor being thrashed (default: False)
+    store_thrash_probability  Probability of thrashing a monitor's store
+                              (default: 50)
+    thrash_many         Thrash multiple monitors instead of just one. If
+                        'maintain_quorum' is set to False, then we will
+                        thrash up to as many monitors as there are
+                        available. (default: False)
+    maintain_quorum     Always maintain quorum, taking care on how many
+                        monitors we kill during the thrashing. If we
+                        happen to only have one or two monitors configured,
+                        if this option is set to True, then we won't run
+                        this task as we cannot guarantee maintenance of
+                        quorum. Setting it to false however would allow the
+                        task to run with as many as just one single monitor.
+                        (default: True)
+    freeze_mon_probability: how often to freeze the mon instead of killing it,
+                        in % (default: 0)
+    freeze_mon_duration: how many seconds to freeze the mon (default: 15)
+    scrub               Scrub after each iteration (default: True)
+    check_mds_failover  Check if mds failover happened (default: False)
+
+    Note: if 'store_thrash' is set to True, then 'maintain_quorum' must also
+          be set to True.
+
+    For example::
+
+    tasks:
+    - ceph:
+    - mon_thrash:
+        revive_delay: 20
+        thrash_delay: 1
+        store_thrash: true
+        store_thrash_probability: 40
+        seed: 31337
+        maintain_quorum: true
+        thrash_many: true
+        check_mds_failover: True
+    - ceph-fuse:
+    - workunit:
+        clients:
+          all:
+            - mon/workloadgen.sh
+    """
+    def __init__(self, ctx, manager, config, name, logger):
+        super(MonitorThrasher, self).__init__()
+
+        self.ctx = ctx
+        self.manager = manager
+        self.manager.wait_for_clean()
+
+        self.stopping = False
+        self.logger = logger
+        self.config = config
+        self.name = name
+
+        if self.config is None:
+            self.config = dict()
+
+        """ Test reproducibility """
+        self.random_seed = self.config.get('seed', None)
+
+        if self.random_seed is None:
+            self.random_seed = int(time.time())
+
+        self.rng = random.Random()
+        self.rng.seed(int(self.random_seed))
+
+        """ Monitor thrashing """
+        self.revive_delay = float(self.config.get('revive_delay', 10.0))
+        self.thrash_delay = float(self.config.get('thrash_delay', 0.0))
+
+        self.thrash_many = self.config.get('thrash_many', False)
+        self.maintain_quorum = self.config.get('maintain_quorum', True)
+
+        self.scrub = self.config.get('scrub', True)
+
+        self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+        self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
+
+        assert self.max_killable() > 0, \
+            'Unable to kill at least one monitor with the current config.'
+
+        """ Store thrashing """
+        self.store_thrash = self.config.get('store_thrash', False)
+        self.store_thrash_probability = int(
+            self.config.get('store_thrash_probability', 50))
+        if self.store_thrash:
+            assert self.store_thrash_probability > 0, \
+                'store_thrash is set, probability must be > 0'
+            assert self.maintain_quorum, \
+                'store_thrash = true must imply maintain_quorum = true'
+
+        #MDS failover
+        self.mds_failover = self.config.get('check_mds_failover', False)
+
+        if self.mds_failover:
+            self.mds_cluster = MDSCluster(ctx)
+
+        self.thread = gevent.spawn(self.do_thrash)
+
+    def log(self, x):
+        """
+        locally log info messages
+        """
+        self.logger.info(x)
+
+    def do_join(self):
+        """
+        Break out of this processes thrashing loop.
+        """
+        self.stopping = True
+        self.thread.get()
+
+    def should_thrash_store(self):
+        """
+        If allowed, indicate that we should thrash a certain percentage of
+        the time as determined by the store_thrash_probability value.
+        """
+        if not self.store_thrash:
+            return False
+        return self.rng.randrange(0, 101) < self.store_thrash_probability
+
+    def thrash_store(self, mon):
+        """
+        Thrash the monitor specified.
+        :param mon: monitor to thrash
+        """
+        self.log('thrashing mon.{id} store'.format(id=mon))
+        out = self.manager.raw_cluster_cmd(
+            'tell', 'mon.%s' % mon, 'sync_force',
+            '--yes-i-really-mean-it')
+        j = json.loads(out)
+        assert j['ret'] == 0, \
+            'error forcing store sync on mon.{id}:\n{ret}'.format(
+                id=mon,ret=out)
+
+    def should_freeze_mon(self):
+        """
+        Indicate that we should freeze a certain percentago of the time
+        as determined by the freeze_mon_probability value.
+        """
+        return self.rng.randrange(0, 101) < self.freeze_mon_probability
+
+    def freeze_mon(self, mon):
+        """
+        Send STOP signal to freeze the monitor.
+        """
+        log.info('Sending STOP to mon %s', mon)
+        self.manager.signal_mon(mon, 19)  # STOP
+
+    def unfreeze_mon(self, mon):
+        """
+        Send CONT signal to unfreeze the monitor.
+        """
+        log.info('Sending CONT to mon %s', mon)
+        self.manager.signal_mon(mon, 18)  # CONT
+
+    def kill_mon(self, mon):
+        """
+        Kill the monitor specified
+        """
+        self.log('killing mon.{id}'.format(id=mon))
+        self.manager.kill_mon(mon)
+
+    def revive_mon(self, mon):
+        """
+        Revive the monitor specified
+        """
+        self.log('killing mon.{id}'.format(id=mon))
+        self.log('reviving mon.{id}'.format(id=mon))
+        self.manager.revive_mon(mon)
+
+    def max_killable(self):
+        """
+        Return the maximum number of monitors we can kill.
+        """
+        m = len(_get_mons(self.ctx))
+        if self.maintain_quorum:
+            return max(math.ceil(m/2.0)-1, 0)
+        else:
+            return m
+
+    def do_thrash(self):
+        """
+        _do_thrash() wrapper.
+        """
+        try:
+            self._do_thrash()
+        except Exception as e:
+            # See _run exception comment for MDSThrasher
+            self.set_thrasher_exception(e)
+            self.logger.exception("exception:")
+            # Allow successful completion so gevent doesn't see an exception.
+            # The DaemonWatchdog will observe the error and tear down the test.
+
+    def _do_thrash(self):
+        """
+        Continuously loop and thrash the monitors.
+        """
+        #status before mon thrashing
+        if self.mds_failover:
+            oldstatus = self.mds_cluster.status()
+
+        self.log('start thrashing')
+        self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
+                   'thrash many: {tm}, maintain quorum: {mq} '\
+                   'store thrash: {st}, probability: {stp} '\
+                   'freeze mon: prob {fp} duration {fd}'.format(
+                s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
+                tm=self.thrash_many, mq=self.maintain_quorum,
+                st=self.store_thrash,stp=self.store_thrash_probability,
+                fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
+                ))
+
+        while not self.stopping:
+            mons = _get_mons(self.ctx)
+            self.manager.wait_for_mon_quorum_size(len(mons))
+            self.log('making sure all monitors are in the quorum')
+            for m in mons:
+                s = self.manager.get_mon_status(m)
+                assert s['state'] == 'leader' or s['state'] == 'peon'
+                assert len(s['quorum']) == len(mons)
+
+            kill_up_to = self.rng.randrange(1, self.max_killable()+1)
+            mons_to_kill = self.rng.sample(mons, kill_up_to)
+            self.log('monitors to thrash: {m}'.format(m=mons_to_kill))
+
+            mons_to_freeze = []
+            for mon in mons:
+                if mon in mons_to_kill:
+                    continue
+                if self.should_freeze_mon():
+                    mons_to_freeze.append(mon)
+            self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))
+
+            for mon in mons_to_kill:
+                self.log('thrashing mon.{m}'.format(m=mon))
+
+                """ we only thrash stores if we are maintaining quorum """
+                if self.should_thrash_store() and self.maintain_quorum:
+                    self.thrash_store(mon)
+
+                self.kill_mon(mon)
+
+            if mons_to_freeze:
+                for mon in mons_to_freeze:
+                    self.freeze_mon(mon)
+                self.log('waiting for {delay} secs to unfreeze mons'.format(
+                    delay=self.freeze_mon_duration))
+                time.sleep(self.freeze_mon_duration)
+                for mon in mons_to_freeze:
+                    self.unfreeze_mon(mon)
+
+            if self.maintain_quorum:
+                self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
+                for m in mons:
+                    if m in mons_to_kill:
+                        continue
+                    s = self.manager.get_mon_status(m)
+                    assert s['state'] == 'leader' or s['state'] == 'peon'
+                    assert len(s['quorum']) == len(mons)-len(mons_to_kill)
+
+            self.log('waiting for {delay} secs before reviving monitors'.format(
+                delay=self.revive_delay))
+            time.sleep(self.revive_delay)
+
+            for mon in mons_to_kill:
+                self.revive_mon(mon)
+            # do more freezes
+            if mons_to_freeze:
+                for mon in mons_to_freeze:
+                    self.freeze_mon(mon)
+                self.log('waiting for {delay} secs to unfreeze mons'.format(
+                    delay=self.freeze_mon_duration))
+                time.sleep(self.freeze_mon_duration)
+                for mon in mons_to_freeze:
+                    self.unfreeze_mon(mon)
+
+            self.manager.wait_for_mon_quorum_size(len(mons))
+            for m in mons:
+                s = self.manager.get_mon_status(m)
+                assert s['state'] == 'leader' or s['state'] == 'peon'
+                assert len(s['quorum']) == len(mons)
+
+            if self.scrub:
+                self.log('triggering scrub')
+                try:
+                    self.manager.raw_cluster_cmd('mon', 'scrub')
+                except Exception as e:
+                    log.warning("Ignoring exception while triggering scrub: %s", e)
+
+            if self.thrash_delay > 0.0:
+                self.log('waiting for {delay} secs before continuing thrashing'.format(
+                    delay=self.thrash_delay))
+                time.sleep(self.thrash_delay)
+
+        #status after thrashing
+        if self.mds_failover:
+            status = self.mds_cluster.status()
+            assert not oldstatus.hadfailover(status), \
+                'MDS Failover'
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Stress test the monitor by thrashing them while another task/workunit
+    is running.
+
+    Please refer to MonitorThrasher class for further information on the
+    available options.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'mon_thrash task only accepts a dict for configuration'
+    assert len(_get_mons(ctx)) > 2, \
+        'mon_thrash task requires at least 3 monitors'
+
+    if 'cluster' not in config:
+        config['cluster'] = 'ceph'
+
+    log.info('Beginning mon_thrash...')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+    thrash_proc = MonitorThrasher(ctx,
+        manager, config, "MonitorThrasher",
+        logger=log.getChild('mon_thrasher'))
+    ctx.ceph[config['cluster']].thrashers.append(thrash_proc)
+    try:
+        log.debug('Yielding')
+        yield
+    finally:
+        log.info('joining mon_thrasher')
+        thrash_proc.do_join()
+        mons = _get_mons(ctx)
+        manager.wait_for_mon_quorum_size(len(mons))
diff --git a/qa/tasks/multibench.py b/qa/tasks/multibench.py
new file mode 100644
index 000000000..c2a7299f1
--- /dev/null
+++ b/qa/tasks/multibench.py
@@ -0,0 +1,61 @@
+"""
+Multibench testing
+"""
+import contextlib
+import logging
+import time
+import copy
+import gevent
+
+from tasks import radosbench
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run multibench
+
+    The config should be as follows:
+
+    multibench:
+        time: <seconds to run total>
+        segments: <number of concurrent benches>
+        radosbench: <config for radosbench>
+
+    example:
+
+    tasks:
+    - ceph:
+    - multibench:
+        clients: [client.0]
+        time: 360
+    - interactive:
+    """
+    log.info('Beginning multibench...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+
+    def run_one(num):
+        """Run test spawn from gevent"""
+        start = time.time()
+        if not config.get('radosbench'):
+            benchcontext = {}
+        else:
+            benchcontext = copy.copy(config.get('radosbench'))
+        iterations = 0
+        while time.time() - start < int(config.get('time', 600)):
+            log.info("Starting iteration %s of segment %s"%(iterations, num))
+            benchcontext['pool'] = str(num) + "-" + str(iterations)
+            with radosbench.task(ctx, benchcontext):
+                time.sleep()
+            iterations += 1
+    log.info("Starting %s threads"%(str(config.get('segments', 3)),))
+    segments = [
+        gevent.spawn(run_one, i)
+        for i in range(0, int(config.get('segments', 3)))]
+
+    try:
+        yield
+    finally:
+        [i.get() for i in segments]
diff --git a/qa/tasks/netem.py b/qa/tasks/netem.py
new file mode 100644
index 000000000..1d9fd98f7
--- /dev/null
+++ b/qa/tasks/netem.py
@@ -0,0 +1,268 @@
+"""
+Task to run tests with network delay between two remotes using tc and netem.
+Reference:https://wiki.linuxfoundation.org/networking/netem.
+
+"""
+
+import logging
+import contextlib
+from paramiko import SSHException
+import socket
+import time
+import gevent
+import argparse
+
+log = logging.getLogger(__name__)
+
+
+def set_priority(interface):
+
+    # create a priority queueing discipline
+    return ['sudo', 'tc', 'qdisc', 'add', 'dev', interface, 'root', 'handle', '1:', 'prio']
+
+
+def show_tc(interface):
+
+    # shows tc device present
+    return ['sudo', 'tc', 'qdisc', 'show', 'dev', interface]
+
+
+def del_tc(interface):
+
+    return ['sudo', 'tc', 'qdisc', 'del', 'dev', interface, 'root']
+
+
+def cmd_prefix(interface):
+
+    # prepare command to set delay
+    cmd1 = ['sudo', 'tc', 'qdisc', 'add', 'dev', interface, 'parent',
+                     '1:1', 'handle', '2:', 'netem', 'delay']
+
+    # prepare command to change delay
+    cmd2 = ['sudo', 'tc', 'qdisc', 'replace', 'dev', interface, 'root', 'netem', 'delay']
+
+    # prepare command to apply filter to the matched ip/host
+
+    cmd3 = ['sudo', 'tc', 'filter', 'add', 'dev', interface,
+                     'parent', '1:0', 'protocol', 'ip', 'pref', '55',
+                     'handle', '::55', 'u32', 'match', 'ip', 'dst']
+
+    return cmd1, cmd2, cmd3
+
+
+def static_delay(remote, host, interface, delay):
+
+    """ Sets a constant delay between two hosts to emulate network delays using tc qdisc and netem"""
+
+    set_delay, change_delay, set_ip = cmd_prefix(interface)
+
+    ip = socket.gethostbyname(host.hostname)
+
+    tc = remote.sh(show_tc(interface))
+    if tc.strip().find('refcnt') == -1:
+        # call set_priority() func to create priority queue
+        # if not already created(indicated by -1)
+        log.info('Create priority queue')
+        remote.run(args=set_priority(interface))
+
+        # set static delay, with +/- 5ms jitter with normal distribution as default
+        log.info('Setting delay to %s' % delay)
+        set_delay.extend(['%s' % delay, '5ms', 'distribution', 'normal'])
+        remote.run(args=set_delay)
+
+        # set delay to a particular remote node via ip
+        log.info('Delay set on %s' % remote)
+        set_ip.extend(['%s' % ip, 'flowid', '2:1'])
+        remote.run(args=set_ip)
+    else:
+        # if the device is already created, only change the delay
+        log.info('Setting delay to %s' % delay)
+        change_delay.extend(['%s' % delay, '5ms', 'distribution', 'normal'])
+        remote.run(args=change_delay)
+
+
+def variable_delay(remote, host, interface, delay_range=[]):
+
+    """ Vary delay between two values"""
+
+    set_delay, change_delay, set_ip = cmd_prefix(interface)
+
+    ip = socket.gethostbyname(host.hostname)
+
+    # delay1 has to be lower than delay2
+    delay1 = delay_range[0]
+    delay2 = delay_range[1]
+
+    tc = remote.sh(show_tc(interface))
+    if tc.strip().find('refcnt') == -1:
+        # call set_priority() func to create priority queue
+        # if not already created(indicated by -1)
+        remote.run(args=set_priority(interface))
+
+        # set variable delay
+        log.info('Setting varying delay')
+        set_delay.extend(['%s' % delay1, '%s' % delay2])
+        remote.run(args=set_delay)
+
+        # set delay to a particular remote node via ip
+        log.info('Delay set on %s' % remote)
+        set_ip.extend(['%s' % ip, 'flowid', '2:1'])
+        remote.run(args=set_ip)
+    else:
+        # if the device is already created, only change the delay
+        log.info('Setting varying delay')
+        change_delay.extend(['%s' % delay1, '%s' % delay2])
+        remote.run(args=change_delay)
+
+
+def delete_dev(remote, interface):
+
+    """ Delete the qdisc if present"""
+
+    log.info('Delete tc')
+    tc = remote.sh(show_tc(interface))
+    if tc.strip().find('refcnt') != -1:
+        remote.run(args=del_tc(interface))
+
+
+class Toggle:
+
+    stop_event = gevent.event.Event()
+
+    def __init__(self, ctx, remote, host, interface, interval):
+        self.ctx = ctx
+        self.remote = remote
+        self.host = host
+        self.interval = interval
+        self.interface = interface
+        self.ip = socket.gethostbyname(self.host.hostname)
+
+    def packet_drop(self):
+
+        """ Drop packets to the remote ip specified"""
+
+        _, _, set_ip = cmd_prefix(self.interface)
+
+        tc = self.remote.sh(show_tc(self.interface))
+        if tc.strip().find('refcnt') == -1:
+            self.remote.run(args=set_priority(self.interface))
+            # packet drop to specific ip
+            log.info('Drop all packets to %s' % self.host)
+            set_ip.extend(['%s' % self.ip, 'action', 'drop'])
+            self.remote.run(args=set_ip)
+
+    def link_toggle(self):
+
+        """
+         For toggling packet drop and recovery in regular interval.
+         If interval is 5s, link is up for 5s and link is down for 5s
+        """
+
+        while not self.stop_event.is_set():
+            self.stop_event.wait(timeout=self.interval)
+            # simulate link down
+            try:
+                self.packet_drop()
+                log.info('link down')
+            except SSHException:
+                log.debug('Failed to run command')
+
+            self.stop_event.wait(timeout=self.interval)
+            # if qdisc exist,delete it.
+            try:
+                delete_dev(self.remote, self.interface)
+                log.info('link up')
+            except SSHException:
+                log.debug('Failed to run command')
+
+    def begin(self, gname):
+        self.thread = gevent.spawn(self.link_toggle)
+        self.ctx.netem.names[gname] = self.thread
+
+    def end(self, gname):
+        self.stop_event.set()
+        log.info('gname is {}'.format(self.ctx.netem.names[gname]))
+        self.ctx.netem.names[gname].get()
+
+    def cleanup(self):
+        """
+        Invoked during unwinding if the test fails or exits before executing task 'link_recover'
+        """
+        log.info('Clean up')
+        self.stop_event.set()
+        self.thread.get()
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+
+    """
+    - netem:
+          clients: [c1.rgw.0]
+          iface: eno1
+          dst_client: [c2.rgw.1]
+          delay: 10ms
+
+    - netem:
+          clients: [c1.rgw.0]
+          iface: eno1
+          dst_client: [c2.rgw.1]
+          delay_range: [10ms, 20ms] # (min, max)
+
+    - netem:
+          clients: [rgw.1, mon.0]
+          iface: eno1
+          gname: t1
+          dst_client: [c2.rgw.1]
+          link_toggle_interval: 10 # no unit mentioned. By default takes seconds.
+
+    - netem:
+          clients: [rgw.1, mon.0]
+          iface: eno1
+          link_recover: [t1, t2]
+
+
+    """
+
+    log.info('config %s' % config)
+
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+    if not hasattr(ctx, 'netem'):
+        ctx.netem = argparse.Namespace()
+        ctx.netem.names = {}
+
+    if config.get('dst_client') is not None:
+        dst = config.get('dst_client')
+        (host,) = ctx.cluster.only(dst).remotes.keys()
+
+    for role in config.get('clients', None):
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        ctx.netem.remote = remote
+        if config.get('delay', False):
+            static_delay(remote, host, config.get('iface'), config.get('delay'))
+        if config.get('delay_range', False):
+            variable_delay(remote, host, config.get('iface'), config.get('delay_range'))
+        if config.get('link_toggle_interval', False):
+            log.info('Toggling link for %s' % config.get('link_toggle_interval'))
+            global toggle
+            toggle = Toggle(ctx, remote, host, config.get('iface'), config.get('link_toggle_interval'))
+            toggle.begin(config.get('gname'))
+        if config.get('link_recover', False):
+            log.info('Recovering link')
+            for gname in config.get('link_recover'):
+                toggle.end(gname)
+                log.info('sleeping')
+                time.sleep(config.get('link_toggle_interval'))
+                delete_dev(ctx.netem.remote, config.get('iface'))
+                del ctx.netem.names[gname]
+
+    try:
+        yield
+    finally:
+        if ctx.netem.names:
+            toggle.cleanup()
+        for role in config.get('clients'):
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            delete_dev(remote, config.get('iface'))
+
diff --git a/qa/tasks/netsplit.py b/qa/tasks/netsplit.py
new file mode 100644
index 000000000..b6614dc50
--- /dev/null
+++ b/qa/tasks/netsplit.py
@@ -0,0 +1,73 @@
+"""
+Functions to netsplit test machines.
+
+At present, you must specify monitors to disconnect, and it
+drops those IP pairs. This means OSDs etc on the hosts which use
+the same IP will also be blocked! If you are using multiple IPs on the
+same host within the cluster, daemons on those other IPs will get
+through.
+"""
+import logging
+import re
+
+log = logging.getLogger(__name__)
+
+def get_ip_and_ports(ctx, daemon):
+    assert daemon.startswith('mon.')
+    addr = ctx.ceph['ceph'].mons['{a}'.format(a=daemon)]
+    ips = re.findall("[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+[:[0-9]*]*", addr)
+    assert len(ips) > 0
+    plain_ip = re.match("[0-9\.]*", ips[0]).group()
+    assert plain_ip is not None
+    port_list = []
+    for ip in ips:
+        ip_str, port_str = re.match("([0-9\.]*)([:[0-9]*]*)", ip).groups()
+        assert ip_str == plain_ip
+        if len(port_str) > 0:
+            port_list.append(port_str)
+    return (plain_ip, port_list)
+
+def disconnect(ctx, config):
+    assert len(config) == 2 # we can only disconnect pairs right now
+    # and we can only disconnect mons right now
+    assert config[0].startswith('mon.')
+    assert config[1].startswith('mon.')
+    (ip1, _) = get_ip_and_ports(ctx, config[0])
+    (ip2, _) = get_ip_and_ports(ctx, config[1])
+
+    (host1,) = ctx.cluster.only(config[0]).remotes.iterkeys()
+    (host2,) = ctx.cluster.only(config[1]).remotes.iterkeys()
+    assert host1 is not None
+    assert host2 is not None
+
+    host1.run(
+        args = ["sudo", "iptables", "-A", "INPUT", "-p", "tcp", "-s",
+                ip2, "-j", "DROP"]
+    )
+    host2.run(
+        args = ["sudo", "iptables", "-A", "INPUT", "-p", "tcp", "-s",
+                ip1, "-j", "DROP"]
+    )
+
+def reconnect(ctx, config):
+    assert len(config) == 2 # we can only disconnect pairs right now
+    # and we can only disconnect mons right now
+    assert config[0].startswith('mon.')
+    assert config[1].startswith('mon.')
+
+    (ip1, _) = get_ip_and_ports(ctx, config[0])
+    (ip2, _) = get_ip_and_ports(ctx, config[1])
+
+    (host1,) = ctx.cluster.only(config[0]).remotes.iterkeys()
+    (host2,) = ctx.cluster.only(config[1]).remotes.iterkeys()
+    assert host1 is not None
+    assert host2 is not None
+
+    host1.run(
+        args = ["sudo", "iptables", "-D", "INPUT", "-p", "tcp", "-s",
+                ip2, "-j", "DROP"]
+    )
+    host2.run(
+        args = ["sudo", "iptables", "-D", "INPUT", "-p", "tcp", "-s",
+                ip1, "-j", "DROP"]
+    )
diff --git a/qa/tasks/nvme_loop.py b/qa/tasks/nvme_loop.py
new file mode 100644
index 000000000..1501ad636
--- /dev/null
+++ b/qa/tasks/nvme_loop.py
@@ -0,0 +1,101 @@
+import contextlib
+import logging
+
+from io import StringIO
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    log.info('Setting up nvme_loop on scratch devices...')
+    host = 'hostnqn'
+    port = '1'
+    devs_by_remote = {}
+    old_scratch_by_remote = {}
+    for remote, roles in ctx.cluster.remotes.items():
+        devs = teuthology.get_scratch_devices(remote)
+        devs_by_remote[remote] = devs
+        base = '/sys/kernel/config/nvmet'
+        remote.run(
+            args=[
+                'sudo', 'modprobe', 'nvme_loop',
+                run.Raw('&&'),
+                'sudo', 'mkdir', '-p', f'{base}/hosts/{host}',
+                run.Raw('&&'),
+                'sudo', 'mkdir', '-p', f'{base}/ports/{port}',
+                run.Raw('&&'),
+                'echo', 'loop', run.Raw('|'),
+                'sudo', 'tee', f'{base}/ports/{port}/addr_trtype',
+            ]
+        )
+        for dev in devs:
+            short = dev.split('/')[-1]
+            log.info(f'Connecting nvme_loop {remote.shortname}:{dev}...')
+            remote.run(
+                args=[
+                    'sudo', 'mkdir', '-p', f'{base}/subsystems/{short}',
+                    run.Raw('&&'),
+                    'echo', '1', run.Raw('|'),
+                    'sudo', 'tee', f'{base}/subsystems/{short}/attr_allow_any_host',
+                    run.Raw('&&'),
+                    'sudo', 'mkdir', '-p', f'{base}/subsystems/{short}/namespaces/1',
+                    run.Raw('&&'),
+                    'echo', '-n', dev, run.Raw('|'),
+                    'sudo', 'tee', f'{base}/subsystems/{short}/namespaces/1/device_path',
+                    run.Raw('&&'),
+                    'echo', '1', run.Raw('|'),
+                    'sudo', 'tee', f'{base}/subsystems/{short}/namespaces/1/enable',
+                    run.Raw('&&'),
+                    'sudo', 'ln', '-s', f'{base}/subsystems/{short}',
+                    f'{base}/ports/{port}/subsystems/{short}',
+                    run.Raw('&&'),
+                    'sudo', 'nvme', 'connect', '-t', 'loop', '-n', short, '-q', host,
+                ]
+            )
+
+        # identify nvme_loops devices
+        old_scratch_by_remote[remote] = remote.read_file('/scratch_devs')
+
+        with contextutil.safe_while(sleep=1, tries=15) as proceed:
+            while proceed():
+                p = remote.run(args=['sudo', 'nvme', 'list'], stdout=StringIO())
+                new_devs = []
+                for line in p.stdout.getvalue().splitlines():
+                    dev, _, vendor = line.split()[0:3]
+                    if dev.startswith('/dev/') and vendor == 'Linux':
+                        new_devs.append(dev)
+                log.info(f'new_devs {new_devs}')
+                assert len(new_devs) <= len(devs)
+                if len(new_devs) == len(devs):
+                    break
+
+        remote.write_file(
+            path='/scratch_devs',
+            data='\n'.join(new_devs) + '\n',
+            sudo=True
+        )
+
+    try:
+        yield
+
+    finally:
+        for remote, devs in devs_by_remote.items():
+            for dev in devs:
+                short = dev.split('/')[-1]
+                log.info(f'Disconnecting nvme_loop {remote.shortname}:{dev}...')
+                remote.run(
+                    args=[
+                        'sudo', 'nvme', 'disconnect', '-n', short
+                    ],
+                    check_status=False,
+                )
+            remote.write_file(
+                path='/scratch_devs',
+                data=old_scratch_by_remote[remote],
+                sudo=True
+            )
diff --git a/qa/tasks/object_source_down.py b/qa/tasks/object_source_down.py
new file mode 100644
index 000000000..e4519bb6f
--- /dev/null
+++ b/qa/tasks/object_source_down.py
@@ -0,0 +1,101 @@
+"""
+Test Object locations going down
+"""
+import logging
+import time
+from teuthology import misc as teuthology
+from tasks import ceph_manager
+from tasks.util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of object location going down
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+
+    # take 0, 1 out
+    manager.mark_out_osd(0)
+    manager.mark_out_osd(1)
+    manager.wait_for_clean()
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.0',
+            'injectargs',
+            '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+            )
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.1',
+            'injectargs',
+            '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+            )
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.2',
+            'injectargs',
+            '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+            )
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.3',
+            'injectargs',
+            '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+            )
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile])
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])
+
+    manager.mark_out_osd(3)
+    manager.wait_till_active()
+
+    manager.mark_in_osd(0)
+    manager.wait_till_active()
+
+    manager.flush_pg_stats([2, 0])
+
+    manager.mark_out_osd(2)
+    manager.wait_till_active()
+
+    # bring up 1
+    manager.mark_in_osd(1)
+    manager.wait_till_active()
+
+    manager.flush_pg_stats([0, 1])
+    log.info("Getting unfound objects")
+    unfound = manager.get_num_unfound_objects()
+    assert not unfound
+
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+    manager.kill_osd(3)
+    manager.mark_down_osd(3)
+
+    manager.flush_pg_stats([0, 1])
+    log.info("Getting unfound objects")
+    unfound = manager.get_num_unfound_objects()
+    assert unfound
diff --git a/qa/tasks/omapbench.py b/qa/tasks/omapbench.py
new file mode 100644
index 000000000..a5bd3a4df
--- /dev/null
+++ b/qa/tasks/omapbench.py
@@ -0,0 +1,83 @@
+"""
+Run omapbench executable within teuthology
+"""
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run omapbench
+
+    The config should be as follows::
+
+		  omapbench:
+		      clients: [client list]
+		      threads: <threads at once>
+		      objects: <number of objects to write>
+		      entries: <number of entries per object map>
+		      keysize: <number of characters per object map key>
+		      valsize: <number of characters per object map val>
+		      increment: <interval to show in histogram (in ms)>
+		      omaptype: <how the omaps should be generated>
+
+    example::
+
+		  tasks:
+		  - ceph:
+		  - omapbench:
+		      clients: [client.0]
+		      threads: 30
+		      objects: 1000
+		      entries: 10
+		      keysize: 10
+		      valsize: 100
+		      increment: 100
+		      omaptype: uniform
+		  - interactive:
+    """
+    log.info('Beginning omapbench...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+    omapbench = {}
+    testdir = teuthology.get_testdir(ctx)
+    print(str(config.get('increment',-1)))
+    for role in config.get('clients', ['client.0']):
+        assert isinstance(role, str)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        proc = remote.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'omapbench',
+                          '--name', role[len(PREFIX):],
+                          '-t', str(config.get('threads', 30)),
+                          '-o', str(config.get('objects', 1000)),
+                          '--entries', str(config.get('entries',10)),
+                          '--keysize', str(config.get('keysize',10)),
+                          '--valsize', str(config.get('valsize',1000)),
+                          '--inc', str(config.get('increment',10)),
+                          '--omaptype', str(config.get('omaptype','uniform'))
+                          ]).format(tdir=testdir),
+                ],
+            logger=log.getChild('omapbench.{id}'.format(id=id_)),
+            stdin=run.PIPE,
+            wait=False
+            )
+        omapbench[id_] = proc
+
+    try:
+        yield
+    finally:
+        log.info('joining omapbench')
+        run.wait(omapbench.values())
diff --git a/qa/tasks/openssl_keys.py b/qa/tasks/openssl_keys.py
new file mode 100644
index 000000000..f9a7f7ede
--- /dev/null
+++ b/qa/tasks/openssl_keys.py
@@ -0,0 +1,245 @@
+"""
+Generates and installs a signed SSL certificate.
+"""
+import argparse
+import logging
+import os
+
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology.orchestra import run
+from teuthology.task import Task
+
+log = logging.getLogger(__name__)
+
+class OpenSSLKeys(Task):
+    name = 'openssl_keys'
+    """
+    Generates and installs a signed SSL certificate.
+
+    To create a self-signed certificate:
+
+        - openssl_keys:
+            # certificate name
+            root: # results in root.key and root.crt
+
+              # [required] make the private key and certificate available in this client's test directory
+              client: client.0
+
+              # common name, defaults to `hostname`. chained certificates must not share a common name
+              cn: teuthology
+
+              # private key type for -newkey, defaults to rsa:2048
+              key-type: rsa:4096
+
+              # install the certificate as trusted on these clients:
+              install: [client.0, client.1]
+
+
+    To create a certificate signed by a ca certificate:
+
+        - openssl_keys:
+            root: (self-signed certificate as above)
+              ...
+
+            cert-for-client1:
+              client: client.1
+
+              # use another ssl certificate (by 'name') as the certificate authority
+              ca: root  # --CAkey=root.key -CA=root.crt
+
+              # embed the private key in the certificate file
+              embed-key: true
+    """
+
+    def __init__(self, ctx, config):
+        super(OpenSSLKeys, self).__init__(ctx, config)
+        self.certs = []
+        self.installed = []
+
+    def setup(self):
+        # global dictionary allows other tasks to look up certificate paths
+        if not hasattr(self.ctx, 'ssl_certificates'):
+            self.ctx.ssl_certificates = {}
+
+        # use testdir/ca as a working directory
+        self.cadir = '/'.join((misc.get_testdir(self.ctx), 'ca'))
+        # make sure self-signed certs get added first, they don't have 'ca' field
+        configs = sorted(self.config.items(), key=lambda x: 'ca' in x[1])
+        for name, config in configs:
+            # names must be unique to avoid clobbering each others files
+            if name in self.ctx.ssl_certificates:
+                raise ConfigError('ssl: duplicate certificate name {}'.format(name))
+
+            # create the key and certificate
+            cert = self.create_cert(name, config)
+
+            self.ctx.ssl_certificates[name] = cert
+            self.certs.append(cert)
+
+            # install as trusted on the requested clients
+            for client in config.get('install', []):
+                installed = self.install_cert(cert, client)
+                self.installed.append(installed)
+
+    def teardown(self):
+        """
+        Clean up any created/installed certificate files.
+        """
+        for cert in self.certs:
+            self.remove_cert(cert)
+
+        for installed in self.installed:
+            self.uninstall_cert(installed)
+
+    def create_cert(self, name, config):
+        """
+        Create a certificate with the given configuration.
+        """
+        cert = argparse.Namespace()
+        cert.name = name
+        cert.key_type = config.get('key-type', 'rsa:2048')
+
+        cert.client = config.get('client', None)
+        if not cert.client:
+            raise ConfigError('ssl: missing required field "client"')
+
+        (cert.remote,) = self.ctx.cluster.only(cert.client).remotes.keys()
+
+        cert.remote.run(args=['mkdir', '-p', self.cadir])
+
+        cert.key = f'{self.cadir}/{cert.name}.key'
+        cert.certificate = f'{self.cadir}/{cert.name}.crt'
+
+        san_ext = []
+        add_san_default = False
+        cn = config.get('cn', '')
+        if cn == '':
+            cn = cert.remote.hostname
+            add_san_default = True
+        if config.get('add-san', add_san_default):
+            ext = f'{self.cadir}/{cert.name}.ext'
+            san_ext = ['-extfile', ext]
+
+        # provide the common name in -subj to avoid the openssl command prompts
+        subject = f'/CN={cn}'
+
+        # if a ca certificate is provided, use it to sign the new certificate
+        ca = config.get('ca', None)
+        if ca:
+            # the ca certificate must have been created by a prior ssl task
+            ca_cert = self.ctx.ssl_certificates.get(ca, None)
+            if not ca_cert:
+                raise ConfigError(f'ssl: ca {ca} not found for certificate {cert.name}')
+
+            csr = f'{self.cadir}/{cert.name}.csr'
+            srl = f'{self.cadir}/{ca_cert.name}.srl'
+            remove_files = ['rm', csr, srl]
+
+            # these commands are run on the ca certificate's client because
+            # they need access to its private key and cert
+
+            # generate a private key and signing request
+            ca_cert.remote.run(args=['openssl', 'req', '-nodes',
+                '-newkey', cert.key_type, '-keyout', cert.key,
+                '-out', csr, '-subj', subject])
+
+            if san_ext:
+                remove_files.append(ext)
+                ca_cert.remote.write_file(path=ext,
+                    data='subjectAltName = DNS:{},IP:{}'.format(
+                        cn,
+                        config.get('ip', cert.remote.ip_address)))
+
+            # create the signed certificate
+            ca_cert.remote.run(args=['openssl', 'x509', '-req', '-in', csr,
+                '-CA', ca_cert.certificate, '-CAkey', ca_cert.key, '-CAcreateserial',
+                '-out', cert.certificate, '-days', '365', '-sha256'] + san_ext)
+
+            ca_cert.remote.run(args=remove_files) # clean up the signing request and serial
+
+            # verify the new certificate against its ca cert
+            ca_cert.remote.run(args=['openssl', 'verify',
+                '-CAfile', ca_cert.certificate, cert.certificate])
+
+            if cert.remote != ca_cert.remote:
+                # copy to remote client
+                self.remote_copy_file(ca_cert.remote, cert.certificate, cert.remote, cert.certificate)
+                self.remote_copy_file(ca_cert.remote, cert.key, cert.remote, cert.key)
+                # clean up the local copies
+                ca_cert.remote.run(args=['rm', cert.certificate, cert.key])
+                # verify the remote certificate (requires ca to be in its trusted ca certificate store)
+                cert.remote.run(args=['openssl', 'verify', cert.certificate])
+        else:
+            # otherwise, generate a private key and use it to self-sign a new certificate
+            cert.remote.run(args=['openssl', 'req', '-x509', '-nodes',
+                '-newkey', cert.key_type, '-keyout', cert.key,
+                '-days', '365', '-out', cert.certificate, '-subj', subject])
+
+        if config.get('embed-key', False):
+            # append the private key to the certificate file
+            cert.remote.run(args=['cat', cert.key, run.Raw('>>'), cert.certificate])
+
+        return cert
+
+    def remove_cert(self, cert):
+        """
+        Delete all of the files associated with the given certificate.
+        """
+        # remove the private key and certificate
+        cert.remote.run(args=['rm', '-f', cert.certificate, cert.key])
+
+        # remove ca subdirectory if it's empty
+        cert.remote.run(args=['rmdir', '--ignore-fail-on-non-empty', self.cadir])
+
+    def install_cert(self, cert, client):
+        """
+        Install as a trusted ca certificate on the given client.
+        """
+        (remote,) = self.ctx.cluster.only(client).remotes.keys()
+
+        installed = argparse.Namespace()
+        installed.remote = remote
+
+        if remote.os.package_type == 'deb':
+            installed.path = '/usr/local/share/ca-certificates/{}.crt'.format(cert.name)
+            installed.command = ['sudo', 'update-ca-certificates']
+        else:
+            installed.path = '/usr/share/pki/ca-trust-source/anchors/{}.crt'.format(cert.name)
+            installed.command = ['sudo', 'update-ca-trust']
+
+        cp_or_mv = 'cp'
+        if remote != cert.remote:
+            # copy into remote cadir (with mkdir if necessary)
+            remote.run(args=['mkdir', '-p', self.cadir])
+            self.remote_copy_file(cert.remote, cert.certificate, remote, cert.certificate)
+            cp_or_mv = 'mv' # move this remote copy into the certificate store
+
+        # install into certificate store as root
+        remote.run(args=['sudo', cp_or_mv, cert.certificate, installed.path])
+        remote.run(args=installed.command)
+
+        return installed
+
+    def uninstall_cert(self, installed):
+        """
+        Uninstall a certificate from the trusted certificate store.
+        """
+        installed.remote.run(args=['sudo', 'rm', installed.path])
+        installed.remote.run(args=installed.command)
+
+    def remote_copy_file(self, from_remote, from_path, to_remote, to_path):
+        """
+        Copies a file from one remote to another.
+
+        The remotes don't have public-key auth for 'scp' or misc.copy_file(),
+        so this copies through an intermediate local tmp file.
+        """
+        log.info('copying from {}:{} to {}:{}...'.format(from_remote, from_path, to_remote, to_path))
+        local_path = from_remote.get_file(from_path)
+        try:
+            to_remote.put_file(local_path, to_path)
+        finally:
+            os.remove(local_path)
+
+task = OpenSSLKeys
diff --git a/qa/tasks/osd_backfill.py b/qa/tasks/osd_backfill.py
new file mode 100644
index 000000000..b33e1c912
--- /dev/null
+++ b/qa/tasks/osd_backfill.py
@@ -0,0 +1,104 @@
+"""
+Osd backfill test
+"""
+import logging
+import time
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+
+def rados_start(ctx, remote, cmd):
+    """
+    Run a remote rados command (currently used to only write data)
+    """
+    log.info("rados %s" % ' '.join(cmd))
+    testdir = teuthology.get_testdir(ctx)
+    pre = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'rados',
+        ];
+    pre.extend(cmd)
+    proc = remote.run(
+        args=pre,
+        wait=False,
+        )
+    return proc
+
+def task(ctx, config):
+    """
+    Test backfill
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'thrashosds task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+    assert num_osds == 3
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+    # write some data
+    p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096',
+                          '--no-cleanup'])
+    err = p.wait()
+    log.info('err is %d' % err)
+
+    # mark osd.0 out to trigger a rebalance/backfill
+    manager.mark_out_osd(0)
+
+    # also mark it down to it won't be included in pg_temps
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+
+    # wait for everything to peer and be happy...
+    manager.flush_pg_stats([1, 2])
+    manager.wait_for_recovery()
+
+    # write some new data
+    p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096',
+                          '--no-cleanup'])
+
+    time.sleep(15)
+
+    # blackhole + restart osd.1
+    # this triggers a divergent backfill target
+    manager.blackhole_kill_osd(1)
+    time.sleep(2)
+    manager.revive_osd(1)
+
+    # wait for our writes to complete + succeed
+    err = p.wait()
+    log.info('err is %d' % err)
+
+    # wait for osd.1 and osd.2 to be up
+    manager.wait_till_osd_is_up(1)
+    manager.wait_till_osd_is_up(2)
+
+    # cluster must recover
+    manager.flush_pg_stats([1, 2])
+    manager.wait_for_recovery()
+
+    # re-add osd.0
+    manager.revive_osd(0)
+    manager.flush_pg_stats([1, 2])
+    manager.wait_for_clean()
+
+
diff --git a/qa/tasks/osd_failsafe_enospc.py b/qa/tasks/osd_failsafe_enospc.py
new file mode 100644
index 000000000..fe2996a78
--- /dev/null
+++ b/qa/tasks/osd_failsafe_enospc.py
@@ -0,0 +1,218 @@
+"""
+Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
+"""
+from io import StringIO
+import logging
+import time
+
+from teuthology.orchestra import run
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
+    configuration settings
+
+    In order for test to pass must use log-ignorelist as follows
+
+        tasks:
+            - chef:
+            - install:
+            - ceph:
+                log-ignorelist: ['OSD near full', 'OSD full dropping all updates']
+            - osd_failsafe_enospc:
+
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'osd_failsafe_enospc task only accepts a dict for configuration'
+
+    # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
+    sleep_time = 50
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+    dummyfile2 = '/etc/resolv.conf'
+
+    manager = ctx.managers['ceph']
+
+    # create 1 pg pool with 1 rep which can only be on osd.0
+    osds = manager.get_osd_dump()
+    for osd in osds:
+        if osd['osd'] != 0:
+            manager.mark_out_osd(osd['osd'])
+
+    log.info('creating pool foo')
+    manager.create_pool("foo")
+    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')
+
+    # State NONE -> NEAR
+    log.info('1. Verify warning messages when exceeding nearfull_ratio')
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    proc = mon.run(
+             args=[
+                 'sudo',
+                 'daemon-helper',
+                 'kill',
+                 'ceph', '-w'
+             ],
+             stdin=run.PIPE,
+             stdout=StringIO(),
+             wait=False,
+        )
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')
+
+    time.sleep(sleep_time)
+    proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
+    proc.wait()
+
+    lines = proc.stdout.getvalue().split('\n')
+
+    count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
+    assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
+    count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
+
+    # State NEAR -> FULL
+    log.info('2. Verify error messages when exceeding full_ratio')
+
+    proc = mon.run(
+             args=[
+                 'sudo',
+                 'daemon-helper',
+                 'kill',
+                 'ceph', '-w'
+             ],
+             stdin=run.PIPE,
+             stdout=StringIO(),
+             wait=False,
+        )
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
+
+    time.sleep(sleep_time)
+    proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
+    proc.wait()
+
+    lines = proc.stdout.getvalue().split('\n')
+
+    count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
+
+    log.info('3. Verify write failure when exceeding full_ratio')
+
+    # Write data should fail
+    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
+    assert ret != 0, 'Expected write failure but it succeeded with exit status 0'
+
+    # Put back default
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
+    time.sleep(10)
+
+    # State FULL -> NEAR
+    log.info('4. Verify write success when NOT exceeding full_ratio')
+
+    # Write should succeed
+    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
+    assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret
+
+    log.info('5. Verify warning messages again when exceeding nearfull_ratio')
+
+    proc = mon.run(
+             args=[
+                 'sudo',
+                 'daemon-helper',
+                 'kill',
+                 'ceph', '-w'
+             ],
+             stdin=run.PIPE,
+             stdout=StringIO(),
+             wait=False,
+        )
+
+    time.sleep(sleep_time)
+    proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
+    proc.wait()
+
+    lines = proc.stdout.getvalue().split('\n')
+
+    count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
+    assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
+    count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
+    time.sleep(10)
+
+    # State NONE -> FULL
+    log.info('6. Verify error messages again when exceeding full_ratio')
+
+    proc = mon.run(
+             args=[
+                 'sudo',
+                 'daemon-helper',
+                 'kill',
+                 'ceph', '-w'
+             ],
+             stdin=run.PIPE,
+             stdout=StringIO(),
+             wait=False,
+        )
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
+
+    time.sleep(sleep_time)
+    proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
+    proc.wait()
+
+    lines = proc.stdout.getvalue().split('\n')
+
+    count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
+    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
+    count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
+
+    # State FULL -> NONE
+    log.info('7. Verify no messages settings back to default')
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
+    time.sleep(10)
+
+    proc = mon.run(
+             args=[
+                 'sudo',
+                 'daemon-helper',
+                 'kill',
+                 'ceph', '-w'
+             ],
+             stdin=run.PIPE,
+             stdout=StringIO(),
+             wait=False,
+        )
+
+    time.sleep(sleep_time)
+    proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
+    proc.wait()
+
+    lines = proc.stdout.getvalue().split('\n')
+
+    count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
+    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
+    count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
+    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
+
+    log.info('Test Passed')
+
+    # Bring all OSDs back in
+    manager.remove_pool("foo")
+    for osd in osds:
+        if osd['osd'] != 0:
+            manager.mark_in_osd(osd['osd'])
diff --git a/qa/tasks/osd_max_pg_per_osd.py b/qa/tasks/osd_max_pg_per_osd.py
new file mode 100644
index 000000000..6680fe6e3
--- /dev/null
+++ b/qa/tasks/osd_max_pg_per_osd.py
@@ -0,0 +1,126 @@
+import logging
+import random
+
+
+log = logging.getLogger(__name__)
+
+
+def pg_num_in_all_states(pgs, *states):
+    return sum(1 for state in pgs.values()
+               if all(s in state for s in states))
+
+
+def pg_num_in_any_state(pgs, *states):
+    return sum(1 for state in pgs.values()
+               if any(s in state for s in states))
+
+
+def test_create_from_mon(ctx, config):
+    """
+    osd should stop creating new pools if the number of pg it servers
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once the its pg number drops down below the setting
+    How it works::
+    1. set the hard limit of pg-per-osd to "2"
+    2. create pool.a with pg_num=2
+       # all pgs should be active+clean
+    2. create pool.b with pg_num=2
+       # new pgs belonging to this pool should be unknown (the primary osd
+       reaches the limit) or creating (replica osd reaches the limit)
+    3. remove pool.a
+    4. all pg belonging to pool.b should be active+clean
+    """
+    pg_num = config.get('pg_num', 2)
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+
+    log.info('2. creating pool.b')
+    pool_b = manager.create_pool_with_unique_name(pg_num)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+    pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+    assert pg_pending == pg_num
+
+    log.info('3. removing pool.a')
+    manager.remove_pool(pool_a)
+    pg_states = manager.wait_till_pg_convergence(300)
+    assert len(pg_states) == pg_num
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+
+    # cleanup
+    manager.remove_pool(pool_b)
+
+
+def test_create_from_peer(ctx, config):
+    """
+    osd should stop creating new pools if the number of pg it servers
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once the its pg number drops down below the setting
+
+    How it works::
+    0. create 4 OSDs.
+    1. create pool.a with pg_num=1, size=2
+       pg will be mapped to osd.0, and osd.1, and it should be active+clean
+    2. create pool.b with pg_num=1, size=2.
+       if the pgs stuck in creating, delete the pool since the pool and try
+       again, eventually we'll get the pool to land on the other 2 osds that
+       aren't occupied by pool.a. (this will also verify that pgs for deleted
+       pools get cleaned out of the creating wait list.)
+    3. mark an osd out. verify that some pgs get stuck stale or peering.
+    4. delete a pool, verify pgs go active.
+    """
+    pg_num = config.get('pg_num', 1)
+    from_primary = config.get('from_primary', True)
+
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+
+    log.info('2. creating pool.b')
+    while True:
+        pool_b = manager.create_pool_with_unique_name(pg_num)
+        pg_states = manager.wait_till_pg_convergence(300)
+        pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+        assert pg_created >= pg_num
+        pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+        assert pg_pending == pg_num * 2 - pg_created
+        if pg_created == pg_num * 2:
+            break
+        manager.remove_pool(pool_b)
+
+    log.info('3. mark an osd out')
+    pg_stats = manager.get_pg_stats()
+    pg = random.choice(pg_stats)
+    if from_primary:
+        victim = pg['acting'][-1]
+    else:
+        victim = pg['acting'][0]
+    manager.mark_out_osd(victim)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering')
+    assert pg_stuck > 0
+
+    log.info('4. removing pool.b')
+    manager.remove_pool(pool_b)
+    manager.wait_for_clean(30)
+
+    # cleanup
+    manager.remove_pool(pool_a)
+
+
+def task(ctx, config):
+    assert isinstance(config, dict), \
+        'osd_max_pg_per_osd task only accepts a dict for config'
+    if config.get('test_create_from_mon', True):
+        test_create_from_mon(ctx, config)
+    else:
+        test_create_from_peer(ctx, config)
diff --git a/qa/tasks/osd_recovery.py b/qa/tasks/osd_recovery.py
new file mode 100644
index 000000000..b0623c21b
--- /dev/null
+++ b/qa/tasks/osd_recovery.py
@@ -0,0 +1,193 @@
+"""
+osd recovery
+"""
+import logging
+import time
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+
+def rados_start(testdir, remote, cmd):
+    """
+    Run a remote rados command (currently used to only write data)
+    """
+    log.info("rados %s" % ' '.join(cmd))
+    pre = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'rados',
+        ];
+    pre.extend(cmd)
+    proc = remote.run(
+        args=pre,
+        wait=False,
+        )
+    return proc
+
+def task(ctx, config):
+    """
+    Test (non-backfill) recovery
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+    testdir = teuthology.get_testdir(ctx)
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+    assert num_osds == 3
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+    # test some osdmap flags
+    manager.raw_cluster_cmd('osd', 'set', 'noin')
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'noup')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+    manager.raw_cluster_cmd('osd', 'unset', 'noin')
+    manager.raw_cluster_cmd('osd', 'unset', 'noout')
+    manager.raw_cluster_cmd('osd', 'unset', 'noup')
+    manager.raw_cluster_cmd('osd', 'unset', 'nodown')
+
+    # write some new data
+    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
+                          '--no-cleanup'])
+
+    time.sleep(15)
+
+    # trigger a divergent target:
+    #  blackhole + restart osd.1 (shorter log)
+    manager.blackhole_kill_osd(1)
+    #  kill osd.2 (longer log... we'll make it divergent below)
+    manager.kill_osd(2)
+    time.sleep(2)
+    manager.revive_osd(1)
+
+    # wait for our writes to complete + succeed
+    err = p.wait()
+    log.info('err is %d' % err)
+
+    # cluster must repeer
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_active_or_down()
+
+    # write some more (make sure osd.2 really is divergent)
+    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
+    p.wait()
+
+    # revive divergent osd
+    manager.revive_osd(2)
+
+    while len(manager.get_osd_status()['up']) < 3:
+        log.info('waiting a bit...')
+        time.sleep(2)
+    log.info('3 are up!')
+
+    # cluster must recover
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+
+def test_incomplete_pgs(ctx, config):
+    """
+    Test handling of incomplete pgs.  Requires 4 osds.
+    """
+    testdir = teuthology.get_testdir(ctx)
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+    assert num_osds == 4
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 4:
+        time.sleep(10)
+
+    manager.flush_pg_stats([0, 1, 2, 3])
+    manager.wait_for_clean()
+
+    log.info('Testing incomplete pgs...')
+
+    for i in range(4):
+        manager.set_config(
+            i,
+            osd_recovery_delay_start=1000)
+
+    # move data off of osd.0, osd.1
+    manager.raw_cluster_cmd('osd', 'out', '0', '1')
+    manager.flush_pg_stats([0, 1, 2, 3], [0, 1])
+    manager.wait_for_clean()
+
+    # lots of objects in rbd (no pg log, will backfill)
+    p = rados_start(testdir, mon,
+                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
+                     '--no-cleanup'])
+    p.wait()
+
+    # few objects in rbd pool (with pg log, normal recovery)
+    for f in range(1, 20):
+        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
+                              'foo.%d' % f, '/etc/passwd'])
+        p.wait()
+
+    # move it back
+    manager.raw_cluster_cmd('osd', 'in', '0', '1')
+    manager.raw_cluster_cmd('osd', 'out', '2', '3')
+    time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2, 3], [2, 3])
+    time.sleep(10)
+    manager.wait_for_active()
+
+    assert not manager.is_clean()
+    assert not manager.is_recovered()
+
+    # kill 2 + 3
+    log.info('stopping 2,3')
+    manager.kill_osd(2)
+    manager.kill_osd(3)
+    log.info('...')
+    manager.raw_cluster_cmd('osd', 'down', '2', '3')
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_active_or_down()
+
+    assert manager.get_num_down() > 0
+
+    # revive 2 + 3
+    manager.revive_osd(2)
+    manager.revive_osd(3)
+    while len(manager.get_osd_status()['up']) < 4:
+        log.info('waiting a bit...')
+        time.sleep(2)
+    log.info('all are up!')
+
+    for i in range(4):
+        manager.kick_recovery_wq(i)
+
+    # cluster must recover
+    manager.wait_for_clean()
diff --git a/qa/tasks/peer.py b/qa/tasks/peer.py
new file mode 100644
index 000000000..6b19096b1
--- /dev/null
+++ b/qa/tasks/peer.py
@@ -0,0 +1,90 @@
+"""
+Peer test (Single test, not much configurable here)
+"""
+import logging
+import json
+import time
+
+from tasks import ceph_manager
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test peering.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'peer task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+    for i in range(3):
+        manager.set_config(
+            i,
+            osd_recovery_delay_start=120)
+
+    # take on osd down
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-'])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    # kill another and revive 2, so that some pgs can't peer.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.revive_osd(2)
+    manager.wait_till_osd_is_up(2)
+
+    manager.flush_pg_stats([0, 2])
+
+    manager.wait_for_active_or_down()
+
+    manager.flush_pg_stats([0, 2])
+
+    # look for down pgs
+    num_down_pgs = 0
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        out = manager.raw_cluster_cmd('pg', pg['pgid'], 'query')
+        log.debug("out string %s",out)
+        j = json.loads(out)
+        log.info("pg is %s, query json is %s", pg, j)
+
+        if pg['state'].count('down'):
+            num_down_pgs += 1
+            # verify that it is blocked on osd.1
+            rs = j['recovery_state']
+            assert len(rs) >= 2
+            assert rs[0]['name'] == 'Started/Primary/Peering/Down'
+            assert rs[1]['name'] == 'Started/Primary/Peering'
+            assert rs[1]['blocked']
+            assert rs[1]['down_osds_we_would_probe'] == [1]
+            assert len(rs[1]['peering_blocked_by']) == 1
+            assert rs[1]['peering_blocked_by'][0]['osd'] == 1
+
+    assert num_down_pgs > 0
+
+    # bring it all back
+    manager.revive_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
diff --git a/qa/tasks/peering_speed_test.py b/qa/tasks/peering_speed_test.py
new file mode 100644
index 000000000..9dc658361
--- /dev/null
+++ b/qa/tasks/peering_speed_test.py
@@ -0,0 +1,87 @@
+"""
+Remotely run peering tests.
+"""
+import logging
+import time
+
+log = logging.getLogger(__name__)
+
+from teuthology.task.args import argify
+
+POOLNAME = "POOLNAME"
+ARGS = [
+    ('num_pgs', 'number of pgs to create', 256, int),
+    ('max_time', 'seconds to complete peering', 0, int),
+    ('runs', 'trials to run', 10, int),
+    ('num_objects', 'objects to create', 256 * 1024, int),
+    ('object_size', 'size in bytes for objects', 64, int),
+    ('creation_time_limit', 'time limit for pool population', 60*60, int),
+    ('create_threads', 'concurrent writes for create', 256, int)
+    ]
+
+def setup(ctx, config):
+    """
+    Setup peering test on remotes.
+    """
+    manager = ctx.managers['ceph']
+    manager.clear_pools()
+    manager.create_pool(POOLNAME, config.num_pgs)
+    log.info("populating pool")
+    manager.rados_write_objects(
+        POOLNAME,
+        config.num_objects,
+        config.object_size,
+        config.creation_time_limit,
+        config.create_threads)
+    log.info("done populating pool")
+
+def do_run(ctx, config):
+    """
+    Perform the test.
+    """
+    start = time.time()
+    # mark in osd
+    manager = ctx.managers['ceph']
+    manager.mark_in_osd(0)
+    log.info("writing out objects")
+    manager.rados_write_objects(
+        POOLNAME,
+        config.num_pgs, # write 1 object per pg or so
+        1,
+        config.creation_time_limit,
+        config.num_pgs, # lots of concurrency
+        cleanup = True)
+    peering_end = time.time()
+
+    log.info("peering done, waiting on recovery")
+    manager.wait_for_clean()
+
+    log.info("recovery done")
+    recovery_end = time.time()
+    if config.max_time:
+        assert(peering_end - start < config.max_time)
+    manager.mark_out_osd(0)
+    manager.wait_for_clean()
+    return {
+        'time_to_active': peering_end - start,
+        'time_to_clean': recovery_end - start
+        }
+
+@argify("peering_speed_test", ARGS)
+def task(ctx, config):
+    """
+    Peering speed test
+    """
+    setup(ctx, config)
+    manager = ctx.managers['ceph']
+    manager.mark_out_osd(0)
+    manager.wait_for_clean()
+    ret = []
+    for i in range(config.runs):
+        log.info("Run {i}".format(i = i))
+        ret.append(do_run(ctx, config))
+
+    manager.mark_in_osd(0)
+    ctx.summary['recovery_times'] = {
+        'runs': ret
+        }
diff --git a/qa/tasks/populate_rbd_pool.py b/qa/tasks/populate_rbd_pool.py
new file mode 100644
index 000000000..76395eb68
--- /dev/null
+++ b/qa/tasks/populate_rbd_pool.py
@@ -0,0 +1,82 @@
+"""
+Populate rbd pools
+"""
+import contextlib
+import logging
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Populate <num_pools> pools with prefix <pool_prefix> with <num_images>
+    rbd images at <num_snaps> snaps
+
+    The config could be as follows::
+
+        populate_rbd_pool:
+          client: <client>
+          pool_prefix: foo
+          num_pools: 5
+          num_images: 10
+          num_snaps: 3
+          image_size: 10737418240
+    """
+    if config is None:
+        config = {}
+    client = config.get("client", "client.0")
+    pool_prefix = config.get("pool_prefix", "foo")
+    num_pools = config.get("num_pools", 2)
+    num_images = config.get("num_images", 20)
+    num_snaps = config.get("num_snaps", 4)
+    image_size = config.get("image_size", 100)
+    write_size = config.get("write_size", 1024*1024)
+    write_threads = config.get("write_threads", 10)
+    write_total_per_snap = config.get("write_total_per_snap", 1024*1024*30)
+
+    (remote,) = ctx.cluster.only(client).remotes.keys()
+
+    for poolid in range(num_pools):
+        poolname = "%s-%s" % (pool_prefix, str(poolid))
+        log.info("Creating pool %s" % (poolname,))
+        ctx.managers['ceph'].create_pool(poolname)
+        for imageid in range(num_images):
+            imagename = "rbd-%s" % (str(imageid),)
+            log.info("Creating imagename %s" % (imagename,))
+            remote.run(
+                args = [
+                    "rbd",
+                    "create",
+                    imagename,
+                    "--image-format", "1",
+                    "--size", str(image_size),
+                    "--pool", str(poolname)])
+            def bench_run():
+                remote.run(
+                    args = [
+                        "rbd",
+                        "bench-write",
+                        imagename,
+                        "--pool", poolname,
+                        "--io-size", str(write_size),
+                        "--io-threads", str(write_threads),
+                        "--io-total", str(write_total_per_snap),
+                        "--io-pattern", "rand"])
+            log.info("imagename %s first bench" % (imagename,))
+            bench_run()
+            for snapid in range(num_snaps):
+                snapname = "snap-%s" % (str(snapid),)
+                log.info("imagename %s creating snap %s" % (imagename, snapname))
+                remote.run(
+                    args = [
+                        "rbd", "snap", "create",
+                        "--pool", poolname,
+                        "--snap", snapname,
+                        imagename
+                        ])
+                bench_run()
+
+    try:
+        yield
+    finally:
+        log.info('done')
diff --git a/qa/tasks/pykmip.py b/qa/tasks/pykmip.py
new file mode 100644
index 000000000..45a5af689
--- /dev/null
+++ b/qa/tasks/pykmip.py
@@ -0,0 +1,465 @@
+"""
+Deploy and configure PyKMIP for Teuthology
+"""
+import argparse
+import contextlib
+import logging
+import time
+import tempfile
+import json
+import os
+from io import BytesIO
+from teuthology.orchestra.daemon import DaemonGroup
+from teuthology.orchestra.remote import Remote
+
+import pprint
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.packaging import install_package
+from teuthology.packaging import remove_package
+from teuthology.exceptions import ConfigError
+from tasks.util import get_remote_for_role
+
+log = logging.getLogger(__name__)
+
+
+def get_pykmip_dir(ctx):
+    return '{tdir}/pykmip'.format(tdir=teuthology.get_testdir(ctx))
+
+def run_in_pykmip_dir(ctx, client, args, **kwargs):
+    (remote,) = [client] if isinstance(client,Remote) else ctx.cluster.only(client).remotes.keys()
+    return remote.run(
+        args=['cd', get_pykmip_dir(ctx), run.Raw('&&'), ] + args,
+        **kwargs
+    )
+
+def run_in_pykmip_venv(ctx, client, args, **kwargs):
+    return run_in_pykmip_dir(ctx, client,
+        args = ['.', '.pykmipenv/bin/activate',
+                         run.Raw('&&')
+                        ] + args, **kwargs)
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download PyKMIP from github.
+    Remove downloaded file upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading pykmip...')
+    pykmipdir = get_pykmip_dir(ctx)
+
+    for (client, cconf) in config.items():
+        branch = cconf.get('force-branch', 'master')
+        repo = cconf.get('force-repo', 'https://github.com/OpenKMIP/PyKMIP')
+        sha1 = cconf.get('sha1')
+        log.info("Using branch '%s' for pykmip", branch)
+        log.info('sha1=%s', sha1)
+
+        ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone', '-b', branch, repo,
+                pykmipdir,
+                ],
+            )
+        if sha1 is not None:
+            run_in_pykmip_dir(ctx, client, [
+                    'git', 'reset', '--hard', sha1,
+                ],
+            )
+    try:
+        yield
+    finally:
+        log.info('Removing pykmip...')
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[ 'rm', '-rf', pykmipdir ],
+            )
+
+_bindep_txt = """# should be part of PyKMIP
+libffi-dev [platform:dpkg]
+libffi-devel [platform:rpm]
+libssl-dev [platform:dpkg]
+openssl-devel [platform:redhat]
+libopenssl-devel [platform:suse]
+libsqlite3-dev [platform:dpkg]
+sqlite-devel [platform:rpm]
+python-dev [platform:dpkg]
+python-devel [(platform:redhat platform:base-py2)]
+python3-dev [platform:dpkg]
+python3-devel [(platform:redhat platform:base-py3) platform:suse]
+python3 [platform:suse]
+"""
+
+@contextlib.contextmanager
+def install_packages(ctx, config):
+    """
+    Download the packaged dependencies of PyKMIP.
+    Remove install packages upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Installing system dependenies for PyKMIP...')
+
+    packages = {}
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        # use bindep to read which dependencies we need from temp/bindep.txt
+        fd, local_temp_path = tempfile.mkstemp(suffix='.txt',
+                                               prefix='bindep-')
+        os.write(fd, _bindep_txt.encode())
+        os.close(fd)
+        fd, remote_temp_path = tempfile.mkstemp(suffix='.txt',
+                                               prefix='bindep-')
+        os.close(fd)
+        remote.put_file(local_temp_path, remote_temp_path)
+        os.remove(local_temp_path)
+        run_in_pykmip_venv(ctx, remote, ['pip', 'install', 'bindep'])
+        r = run_in_pykmip_venv(ctx, remote,
+                ['bindep', '--brief', '--file', remote_temp_path],
+                stdout=BytesIO(),
+                check_status=False) # returns 1 on success?
+        packages[client] = r.stdout.getvalue().decode().splitlines()
+        for dep in packages[client]:
+            install_package(dep, remote)
+    try:
+        yield
+    finally:
+        log.info('Removing system dependencies of PyKMIP...')
+
+        for (client, _) in config.items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            for dep in packages[client]:
+                remove_package(dep, remote)
+
+@contextlib.contextmanager
+def setup_venv(ctx, config):
+    """
+    Setup the virtualenv for PyKMIP using pip.
+    """
+    assert isinstance(config, dict)
+    log.info('Setting up virtualenv for pykmip...')
+    for (client, _) in config.items():
+        run_in_pykmip_dir(ctx, client, ['python3', '-m', 'venv', '.pykmipenv'])
+        run_in_pykmip_venv(ctx, client, ['pip', 'install', '--upgrade', 'pip'])
+        run_in_pykmip_venv(ctx, client, ['pip', 'install', 'pytz', '-e', get_pykmip_dir(ctx)])
+    yield
+
+def assign_ports(ctx, config, initial_port):
+    """
+    Assign port numbers starting from @initial_port
+    """
+    port = initial_port
+    role_endpoints = {}
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for role in roles_for_host:
+            if role in config:
+                r = get_remote_for_role(ctx, role)
+                role_endpoints[role] = r.ip_address, port, r.hostname
+                port += 1
+
+    return role_endpoints
+
+def copy_policy_json(ctx, cclient, cconfig):
+    run_in_pykmip_dir(ctx, cclient,
+                        ['cp',
+                         get_pykmip_dir(ctx)+'/examples/policy.json',
+                         get_pykmip_dir(ctx)])
+
+_pykmip_configuration = """# configuration for pykmip
+[server]
+hostname={ipaddr}
+port={port}
+certificate_path={servercert}
+key_path={serverkey}
+ca_path={clientca}
+auth_suite=TLS1.2
+policy_path={confdir}
+enable_tls_client_auth=False
+tls_cipher_suites=
+    TLS_RSA_WITH_AES_128_CBC_SHA256
+    TLS_RSA_WITH_AES_256_CBC_SHA256
+    TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384
+logging_level=DEBUG
+database_path={confdir}/pykmip.sqlite
+[client]
+host={hostname}
+port=5696
+certfile={clientcert}
+keyfile={clientkey}
+ca_certs={clientca}
+ssl_version=PROTOCOL_TLSv1_2
+"""
+
+def create_pykmip_conf(ctx, cclient, cconfig):
+    log.info('#0 cclient={} cconfig={}'.format(pprint.pformat(cclient),pprint.pformat(cconfig)))
+    (remote,) = ctx.cluster.only(cclient).remotes.keys()
+    pykmip_ipaddr, pykmip_port, pykmip_hostname = ctx.pykmip.endpoints[cclient]
+    log.info('#1 ip,p,h {} {} {}'.format(pykmip_ipaddr, pykmip_port, pykmip_hostname))
+    clientca = cconfig.get('clientca', None)
+    log.info('#2 clientca {}'.format(clientca))
+    serverkey = None
+    servercert = cconfig.get('servercert', None)
+    log.info('#3 servercert {}'.format(servercert))
+    servercert = ctx.ssl_certificates.get(servercert)
+    log.info('#4 servercert {}'.format(servercert))
+    clientkey = None
+    clientcert = cconfig.get('clientcert', None)
+    log.info('#3 clientcert {}'.format(clientcert))
+    clientcert = ctx.ssl_certificates.get(clientcert)
+    log.info('#4 clientcert {}'.format(clientcert))
+    clientca = ctx.ssl_certificates.get(clientca)
+    log.info('#5 clientca {}'.format(clientca))
+    if servercert != None:
+      serverkey = servercert.key
+      servercert = servercert.certificate
+      log.info('#6 serverkey {} servercert {}'.format(serverkey, servercert))
+    if clientcert != None:
+      clientkey = clientcert.key
+      clientcert = clientcert.certificate
+      log.info('#6 clientkey {} clientcert {}'.format(clientkey, clientcert))
+    if clientca != None:
+      clientca = clientca.certificate
+      log.info('#7 clientca {}'.format(clientca))
+    if servercert == None or clientca == None or serverkey == None:
+      log.info('#8 clientca {} serverkey {} servercert {}'.format(clientca, serverkey, servercert))
+      raise ConfigError('pykmip: Missing/bad servercert or clientca')
+    pykmipdir = get_pykmip_dir(ctx)
+    kmip_conf = _pykmip_configuration.format(
+        ipaddr=pykmip_ipaddr,
+        port=pykmip_port,
+        confdir=pykmipdir,
+        hostname=pykmip_hostname,
+        clientca=clientca,
+        clientkey=clientkey,
+        clientcert=clientcert,
+        serverkey=serverkey,
+        servercert=servercert
+    )
+    fd, local_temp_path = tempfile.mkstemp(suffix='.conf',
+                                           prefix='pykmip')
+    os.write(fd, kmip_conf.encode())
+    os.close(fd)
+    remote.put_file(local_temp_path, pykmipdir+'/pykmip.conf')
+    os.remove(local_temp_path)
+
+@contextlib.contextmanager
+def configure_pykmip(ctx, config):
+    """
+    Configure pykmip paste-api and pykmip-api.
+    """
+    assert isinstance(config, dict)
+    (cclient, cconfig) = next(iter(config.items()))
+
+    copy_policy_json(ctx, cclient, cconfig)
+    create_pykmip_conf(ctx, cclient, cconfig)
+    try:
+        yield
+    finally:
+        pass
+
+def has_ceph_task(tasks):
+    for task in tasks:
+        for name, conf in task.items():
+            if name == 'ceph':
+                return True
+    return False
+
+@contextlib.contextmanager
+def run_pykmip(ctx, config):
+    assert isinstance(config, dict)
+    if hasattr(ctx, 'daemons'):
+        pass
+    elif has_ceph_task(ctx.config['tasks']):
+        log.info('Delay start pykmip so ceph can do once-only daemon logic')
+        try:
+            yield
+        finally:
+            pass
+    else:
+        ctx.daemons = DaemonGroup()
+    log.info('Running pykmip...')
+
+    pykmipdir = get_pykmip_dir(ctx)
+
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        cluster_name, _, client_id = teuthology.split_role(client)
+
+        # start the public endpoint
+        client_public_with_id = 'pykmip.public' + '.' + client_id
+
+        run_cmd = 'cd ' + pykmipdir + ' && ' + \
+                  '. .pykmipenv/bin/activate && ' + \
+                  'HOME={}'.format(pykmipdir) + ' && ' + \
+                  'exec pykmip-server -f pykmip.conf -l ' + \
+                  pykmipdir + '/pykmip.log & { read; kill %1; }'
+
+        ctx.daemons.add_daemon(
+            remote, 'pykmip', client_public_with_id,
+            cluster=cluster_name,
+            args=['bash', '-c', run_cmd],
+            logger=log.getChild(client),
+            stdin=run.PIPE,
+            cwd=pykmipdir,
+            wait=False,
+            check_status=False,
+        )
+
+        # sleep driven synchronization
+        time.sleep(10)
+    try:
+        yield
+    finally:
+        log.info('Stopping PyKMIP instance')
+        ctx.daemons.get_daemon('pykmip', client_public_with_id,
+                               cluster_name).stop()
+
+make_keys_template = """
+from kmip.pie import client
+from kmip import enums
+import ssl
+import sys
+import json
+from io import BytesIO
+
+c = client.ProxyKmipClient(config_file="{replace-with-config-file-path}")
+
+rl=[]
+for kwargs in {replace-with-secrets}:
+ with c:
+  key_id = c.create(
+   enums.CryptographicAlgorithm.AES,
+   256,
+   operation_policy_name='default',
+   cryptographic_usage_mask=[
+    enums.CryptographicUsageMask.ENCRYPT,
+    enums.CryptographicUsageMask.DECRYPT
+   ],
+   **kwargs
+  )
+  c.activate(key_id)
+  attrs = c.get_attributes(uid=key_id)
+  r = {}
+  for a in attrs[1]:
+   r[str(a.attribute_name)] = str(a.attribute_value)
+  rl.append(r)
+print(json.dumps(rl))
+"""
+
+@contextlib.contextmanager
+def create_secrets(ctx, config):
+    """
+    Create and activate any requested keys in kmip
+    """
+    assert isinstance(config, dict)
+
+    pykmipdir = get_pykmip_dir(ctx)
+    pykmip_conf_path = pykmipdir + '/pykmip.conf'
+    my_output = BytesIO()
+    for (client,cconf) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        secrets=cconf.get('secrets')
+        if secrets:
+            secrets_json = json.dumps(cconf['secrets'])
+            make_keys = make_keys_template \
+                .replace("{replace-with-secrets}",secrets_json) \
+                .replace("{replace-with-config-file-path}",pykmip_conf_path)
+            my_output.truncate()
+            remote.run(args=[run.Raw('. cephtest/pykmip/.pykmipenv/bin/activate;' \
+                + 'python')], stdin=make_keys, stdout = my_output)
+            ctx.pykmip.keys[client] = json.loads(my_output.getvalue().decode())
+    try:
+        yield
+    finally:
+        pass
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy and configure PyKMIP
+
+    Example of configuration:
+
+    tasks:
+    - install:
+    - ceph:
+       conf:
+        client:
+         rgw crypt s3 kms backend: kmip
+         rgw crypt kmip ca path: /home/ubuntu/cephtest/ca/kmiproot.crt
+         rgw crypt kmip client cert: /home/ubuntu/cephtest/ca/kmip-client.crt
+         rgw crypt kmip client key: /home/ubuntu/cephtest/ca/kmip-client.key
+         rgw crypt kmip kms key template: pykmip-$keyid
+    - openssl_keys:
+       kmiproot:
+         client: client.0
+         cn: kmiproot
+         key-type: rsa:4096
+    - openssl_keys:
+       kmip-server:
+         client: client.0
+         ca: kmiproot
+       kmip-client:
+         client: client.0
+         ca: kmiproot
+         cn: rgw-client
+    - pykmip:
+        client.0:
+          force-branch: master
+          clientca: kmiproot
+          servercert: kmip-server
+          clientcert: kmip-client
+          secrets:
+          - name: pykmip-key-1
+          - name: pykmip-key-2
+    - rgw:
+        client.0:
+          use-pykmip-role: client.0
+    - s3tests:
+        client.0:
+          force-branch: master
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task pykmip only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level.
+    for client in config.keys():
+        if not config[client]:
+            config[client] = {}
+        teuthology.deep_merge(config[client], overrides.get('pykmip', {}))
+
+    log.debug('PyKMIP config is %s', config)
+
+    if not hasattr(ctx, 'ssl_certificates'):
+        raise ConfigError('pykmip must run after the openssl_keys task')
+
+
+    ctx.pykmip = argparse.Namespace()
+    ctx.pykmip.endpoints = assign_ports(ctx, config, 5696)
+    ctx.pykmip.keys = {}
+    
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: setup_venv(ctx=ctx, config=config),
+        lambda: install_packages(ctx=ctx, config=config),
+        lambda: configure_pykmip(ctx=ctx, config=config),
+        lambda: run_pykmip(ctx=ctx, config=config),
+        lambda: create_secrets(ctx=ctx, config=config),
+        ):
+        yield
diff --git a/qa/tasks/python.py b/qa/tasks/python.py
new file mode 100644
index 000000000..4ddb14f71
--- /dev/null
+++ b/qa/tasks/python.py
@@ -0,0 +1,45 @@
+import logging
+from teuthology import misc as teuthology
+from tasks.vip import subst_vip
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Execute some python code.
+ 
+      tasks:
+      - python:
+          host.a: |
+            import boto3
+            c = boto3.resource(...)
+
+    The provided dict is normally indexed by role.  You can also include a 
+    'sudo: false' key to run the code without sudo.
+
+      tasks:
+      - python:
+          sudo: false
+          host.b: |
+            import boto3
+            c = boto3.resource(...)
+    """
+    assert isinstance(config, dict), "task python got invalid config"
+
+    testdir = teuthology.get_testdir(ctx)
+
+    sudo = config.pop('sudo', True)
+
+    for role, code in config.items():
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        log.info('Running python on role %s host %s', role, remote.name)
+        log.info(code)
+        args=[
+            'TESTDIR={tdir}'.format(tdir=testdir),
+            'python3',
+        ]
+        if sudo:
+            args = ['sudo'] + args
+        remote.run(args=args, stdin=subst_vip(ctx, code))
+
diff --git a/qa/tasks/qemu.py b/qa/tasks/qemu.py
new file mode 100644
index 000000000..d4802d0cd
--- /dev/null
+++ b/qa/tasks/qemu.py
@@ -0,0 +1,684 @@
+"""
+Qemu task
+"""
+
+import contextlib
+import logging
+import os
+import yaml
+import time
+
+from tasks import rbd
+from tasks.util.workunit import get_refspec_after_overrides
+from teuthology import contextutil
+from teuthology import misc as teuthology
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.packaging import install_package, remove_package
+
+log = logging.getLogger(__name__)
+
+DEFAULT_NUM_DISKS = 2
+DEFAULT_IMAGE_URL = 'http://download.ceph.com/qa/ubuntu-12.04.qcow2'
+DEFAULT_IMAGE_SIZE = 10240 # in megabytes
+ENCRYPTION_HEADER_SIZE = 16 # in megabytes
+DEFAULT_CPUS = 1
+DEFAULT_MEM = 4096 # in megabytes
+
+def normalize_disks(config):
+    # normalize the 'disks' parameter into a list of dictionaries
+    for client, client_config in config.items():
+        clone = client_config.get('clone', False)
+        image_url = client_config.get('image_url', DEFAULT_IMAGE_URL)
+        device_type = client_config.get('type', 'filesystem')
+        encryption_format = client_config.get('encryption_format', 'none')
+
+        disks = client_config.get('disks', DEFAULT_NUM_DISKS)
+        if not isinstance(disks, list):
+            disks = [{'image_name': '{client}.{num}'.format(client=client,
+                                                            num=i)}
+                     for i in range(int(disks))]
+            client_config['disks'] = disks
+
+        for i, disk in enumerate(disks):
+            if 'action' not in disk:
+                disk['action'] = 'create'
+            assert disk['action'] in ['none', 'create', 'clone'], 'invalid disk action'
+            assert disk['action'] != 'clone' or 'parent_name' in disk, 'parent_name required for clone'
+
+            if 'image_size' not in disk:
+                disk['image_size'] = DEFAULT_IMAGE_SIZE
+            disk['image_size'] = int(disk['image_size'])
+
+            if 'image_url' not in disk and i == 0:
+                disk['image_url'] = image_url
+
+            if 'device_type' not in disk:
+                disk['device_type'] = device_type
+
+            disk['device_letter'] = chr(ord('a') + i)
+
+            if 'encryption_format' not in disk:
+                disk['encryption_format'] = encryption_format
+            assert disk['encryption_format'] in ['none', 'luks1', 'luks2'], 'invalid encryption format'
+
+        assert disks, 'at least one rbd device must be used'
+
+        if clone:
+            for disk in disks:
+                if disk['action'] != 'create':
+                    continue
+                clone = dict(disk)
+                clone['action'] = 'clone'
+                clone['parent_name'] = clone['image_name']
+                clone['image_name'] += '-clone'
+                del disk['device_letter']
+                disks.append(clone)
+
+def create_images(ctx, config, managers):
+    for client, client_config in config.items():
+        disks = client_config['disks']
+        for disk in disks:
+            if disk.get('action') != 'create' or (
+                    'image_url' in disk and
+                    disk['encryption_format'] == 'none'):
+                continue
+            image_size = disk['image_size']
+            if disk['encryption_format'] != 'none':
+                image_size += ENCRYPTION_HEADER_SIZE
+            create_config = {
+                client: {
+                    'image_name': disk['image_name'],
+                    'image_format': 2,
+                    'image_size': image_size,
+                    'encryption_format': disk['encryption_format'],
+                    }
+                }
+            managers.append(
+                lambda create_config=create_config:
+                rbd.create_image(ctx=ctx, config=create_config)
+                )
+
+def create_clones(ctx, config, managers):
+    for client, client_config in config.items():
+        disks = client_config['disks']
+        for disk in disks:
+            if disk['action'] != 'clone':
+                continue
+
+            create_config = {
+                client: {
+                    'image_name': disk['image_name'],
+                    'parent_name': disk['parent_name']
+                    }
+                }
+            managers.append(
+                lambda create_config=create_config:
+                rbd.clone_image(ctx=ctx, config=create_config)
+                )
+
+def create_encrypted_devices(ctx, config, managers):
+    for client, client_config in config.items():
+        disks = client_config['disks']
+        for disk in disks:
+            if disk['encryption_format'] == 'none' or \
+                    'device_letter' not in disk:
+                continue
+
+            dev_config = {client: disk}
+            managers.append(
+                lambda dev_config=dev_config:
+                rbd.dev_create(ctx=ctx, config=dev_config)
+                )
+
+@contextlib.contextmanager
+def create_dirs(ctx, config):
+    """
+    Handle directory creation and cleanup
+    """
+    testdir = teuthology.get_testdir(ctx)
+    for client, client_config in config.items():
+        assert 'test' in client_config, 'You must specify a test to run'
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        remote.run(
+            args=[
+                'install', '-d', '-m0755', '--',
+                '{tdir}/qemu'.format(tdir=testdir),
+                '{tdir}/archive/qemu'.format(tdir=testdir),
+                ]
+            )
+    try:
+        yield
+    finally:
+        for client, client_config in config.items():
+            assert 'test' in client_config, 'You must specify a test to run'
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(
+                args=[
+                    'rmdir', '{tdir}/qemu'.format(tdir=testdir), run.Raw('||'), 'true',
+                    ]
+                )
+
+@contextlib.contextmanager
+def install_block_rbd_driver(ctx, config):
+    """
+    Make sure qemu rbd block driver (block-rbd.so) is installed
+    """
+    for client, client_config in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        if remote.os.package_type == 'rpm':
+            block_rbd_pkg = 'qemu-kvm-block-rbd'
+        else:
+            block_rbd_pkg = 'qemu-block-extra'
+        install_package(block_rbd_pkg, remote)
+    try:
+        yield
+    finally:
+        for client, client_config in config.items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remove_package(block_rbd_pkg, remote)
+
+@contextlib.contextmanager
+def generate_iso(ctx, config):
+    """Execute system commands to generate iso"""
+    log.info('generating iso...')
+    testdir = teuthology.get_testdir(ctx)
+
+    # use ctx.config instead of config, because config has been
+    # through teuthology.replace_all_with_clients()
+    refspec = get_refspec_after_overrides(ctx.config, {})
+
+    git_url = teuth_config.get_ceph_qa_suite_git_url()
+    log.info('Pulling tests from %s ref %s', git_url, refspec)
+
+    for client, client_config in config.items():
+        assert 'test' in client_config, 'You must specify a test to run'
+        test = client_config['test']
+
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+
+        clone_dir = '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=client)
+        remote.run(args=refspec.clone(git_url, clone_dir))
+
+        src_dir = os.path.dirname(__file__)
+        userdata_path = os.path.join(testdir, 'qemu', 'userdata.' + client)
+        metadata_path = os.path.join(testdir, 'qemu', 'metadata.' + client)
+
+        with open(os.path.join(src_dir, 'userdata_setup.yaml')) as f:
+            test_setup = ''.join(f.readlines())
+            # configuring the commands to setup the nfs mount
+            mnt_dir = "/export/{client}".format(client=client)
+            test_setup = test_setup.format(
+                mnt_dir=mnt_dir
+            )
+
+        with open(os.path.join(src_dir, 'userdata_teardown.yaml')) as f:
+            test_teardown = ''.join(f.readlines())
+
+        user_data = test_setup
+
+        disks = client_config['disks']
+        for disk in disks:
+            if disk['device_type'] != 'filesystem' or \
+                    'device_letter' not in disk or \
+                    'image_url' in disk:
+                continue
+            dev_letter = disk['device_letter']
+            user_data += """
+- |
+  #!/bin/bash
+  mkdir /mnt/test_{dev_letter}
+  mkfs -t xfs /dev/vd{dev_letter}
+  mount -t xfs /dev/vd{dev_letter} /mnt/test_{dev_letter}
+""".format(dev_letter=dev_letter)
+
+        user_data += """
+- |
+  #!/bin/bash
+  test -d /etc/ceph || mkdir /etc/ceph
+  cp /mnt/cdrom/ceph.* /etc/ceph/
+"""
+
+        cloud_config_archive = client_config.get('cloud_config_archive', [])
+        if cloud_config_archive:
+          user_data += yaml.safe_dump(cloud_config_archive, default_style='|',
+                                      default_flow_style=False)
+
+        # this may change later to pass the directories as args to the
+        # script or something. xfstests needs that.
+        user_data += """
+- |
+  #!/bin/bash
+  test -d /mnt/test_b && cd /mnt/test_b
+  /mnt/cdrom/test.sh > /mnt/log/test.log 2>&1 && touch /mnt/log/success
+""" + test_teardown
+
+        user_data = user_data.format(
+            ceph_branch=ctx.config.get('branch'),
+            ceph_sha1=ctx.config.get('sha1'))
+        remote.write_file(userdata_path, user_data)
+
+        with open(os.path.join(src_dir, 'metadata.yaml'), 'rb') as f:
+            remote.write_file(metadata_path, f)
+
+        test_file = '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client)
+
+        log.info('fetching test %s for %s', test, client)
+        remote.run(
+            args=[
+                'cp', '--', os.path.join(clone_dir, test), test_file,
+                run.Raw('&&'),
+                'chmod', '755', test_file,
+                ],
+            )
+        remote.run(
+            args=[
+                'genisoimage', '-quiet', '-input-charset', 'utf-8',
+                '-volid', 'cidata', '-joliet', '-rock',
+                '-o', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+                '-graft-points',
+                'user-data={userdata}'.format(userdata=userdata_path),
+                'meta-data={metadata}'.format(metadata=metadata_path),
+                'ceph.conf=/etc/ceph/ceph.conf',
+                'ceph.keyring=/etc/ceph/ceph.keyring',
+                'test.sh={file}'.format(file=test_file),
+                ],
+            )
+    try:
+        yield
+    finally:
+        for client in config.keys():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(
+                args=[
+                    'rm', '-rf',
+                    '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+                    os.path.join(testdir, 'qemu', 'userdata.' + client),
+                    os.path.join(testdir, 'qemu', 'metadata.' + client),
+                    '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client),
+                    '{tdir}/qemu_clone.{client}'.format(tdir=testdir, client=client),
+                    ],
+                )
+
+@contextlib.contextmanager
+def download_image(ctx, config):
+    """Downland base image, remove image file when done"""
+    log.info('downloading base image')
+    testdir = teuthology.get_testdir(ctx)
+
+    client_base_files = {}
+    for client, client_config in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+
+        client_base_files[client] = []
+        disks = client_config['disks']
+        for disk in disks:
+            if disk['action'] != 'create' or 'image_url' not in disk:
+                continue
+
+            base_file = '{tdir}/qemu/base.{name}.qcow2'.format(tdir=testdir,
+                                                               name=disk['image_name'])
+            client_base_files[client].append(base_file)
+
+            remote.run(
+                args=[
+                    'wget', '-nv', '-O', base_file, disk['image_url'],
+                    ]
+                )
+
+            if disk['encryption_format'] == 'none':
+                remote.run(
+                    args=[
+                        'qemu-img', 'convert', '-f', 'qcow2', '-O', 'raw',
+                        base_file, 'rbd:rbd/{image_name}'.format(image_name=disk['image_name'])
+                        ]
+                    )
+            else:
+                dev_config = {client: {'image_name': disk['image_name'],
+                                       'encryption_format': disk['encryption_format']}}
+                raw_file = '{tdir}/qemu/base.{name}.raw'.format(
+                    tdir=testdir, name=disk['image_name'])
+                client_base_files[client].append(raw_file)
+                remote.run(
+                    args=[
+                        'qemu-img', 'convert', '-f', 'qcow2', '-O', 'raw',
+                        base_file, raw_file
+                        ]
+                    )
+                with rbd.dev_create(ctx, dev_config):
+                    remote.run(
+                        args=[
+                            'dd', 'if={name}'.format(name=raw_file),
+                            'of={name}'.format(name=dev_config[client]['device_path']),
+                            'bs=4M', 'conv=fdatasync'
+                            ]
+                        )
+
+        for disk in disks:
+            if disk['action'] == 'clone' or \
+                    disk['encryption_format'] != 'none' or \
+                    (disk['action'] == 'create' and 'image_url' not in disk):
+                continue
+
+            remote.run(
+                args=[
+                    'rbd', 'resize',
+                    '--size={image_size}M'.format(image_size=disk['image_size']),
+                    disk['image_name'], run.Raw('||'), 'true'
+                    ]
+                )
+
+    try:
+        yield
+    finally:
+        log.debug('cleaning up base image files')
+        for client, base_files in client_base_files.items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            for base_file in base_files:
+                remote.run(
+                    args=[
+                        'rm', '-f', base_file,
+                        ],
+                    )
+
+
+def _setup_nfs_mount(remote, client, service_name, mount_dir):
+    """
+    Sets up an nfs mount on the remote that the guest can use to
+    store logs. This nfs mount is also used to touch a file
+    at the end of the test to indicate if the test was successful
+    or not.
+    """
+    export_dir = "/export/{client}".format(client=client)
+    log.info("Creating the nfs export directory...")
+    remote.run(args=[
+        'sudo', 'mkdir', '-p', export_dir,
+    ])
+    log.info("Mounting the test directory...")
+    remote.run(args=[
+        'sudo', 'mount', '--bind', mount_dir, export_dir,
+    ])
+    log.info("Adding mount to /etc/exports...")
+    export = "{dir} *(rw,no_root_squash,no_subtree_check,insecure)".format(
+        dir=export_dir
+    )
+    log.info("Deleting export from /etc/exports...")
+    remote.run(args=[
+        'sudo', 'sed', '-i', "\|{export_dir}|d".format(export_dir=export_dir),
+        '/etc/exports'
+    ])
+    remote.run(args=[
+        'echo', export, run.Raw("|"),
+        'sudo', 'tee', '-a', "/etc/exports",
+    ])
+    log.info("Restarting NFS...")
+    if remote.os.package_type == "deb":
+        remote.run(args=['sudo', 'service', 'nfs-kernel-server', 'restart'])
+    else:
+        remote.run(args=['sudo', 'systemctl', 'restart', service_name])
+
+
+def _teardown_nfs_mount(remote, client, service_name):
+    """
+    Tears down the nfs mount on the remote used for logging and reporting the
+    status of the tests being ran in the guest.
+    """
+    log.info("Tearing down the nfs mount for {remote}".format(remote=remote))
+    export_dir = "/export/{client}".format(client=client)
+    log.info("Stopping NFS...")
+    if remote.os.package_type == "deb":
+        remote.run(args=[
+            'sudo', 'service', 'nfs-kernel-server', 'stop'
+        ])
+    else:
+        remote.run(args=[
+            'sudo', 'systemctl', 'stop', service_name
+        ])
+    log.info("Unmounting exported directory...")
+    remote.run(args=[
+        'sudo', 'umount', export_dir
+    ])
+    log.info("Deleting export from /etc/exports...")
+    remote.run(args=[
+        'sudo', 'sed', '-i', "\|{export_dir}|d".format(export_dir=export_dir),
+        '/etc/exports'
+    ])
+    log.info("Starting NFS...")
+    if remote.os.package_type == "deb":
+        remote.run(args=[
+            'sudo', 'service', 'nfs-kernel-server', 'start'
+        ])
+    else:
+        remote.run(args=[
+            'sudo', 'systemctl', 'start', service_name
+        ])
+
+
+@contextlib.contextmanager
+def run_qemu(ctx, config):
+    """Setup kvm environment and start qemu"""
+    procs = []
+    testdir = teuthology.get_testdir(ctx)
+    for client, client_config in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client)
+        remote.run(
+            args=[
+                'mkdir', log_dir, run.Raw('&&'),
+                'sudo', 'modprobe', 'kvm',
+                ]
+            )
+
+        nfs_service_name = 'nfs'
+        if remote.os.name in ['rhel', 'centos'] and float(remote.os.version) >= 8:
+            nfs_service_name = 'nfs-server'
+
+        # make an nfs mount to use for logging and to
+        # allow to test to tell teuthology the tests outcome
+        _setup_nfs_mount(remote, client, nfs_service_name, log_dir)
+
+        # Hack to make sure /dev/kvm permissions are set correctly
+        # See http://tracker.ceph.com/issues/17977 and
+        # https://bugzilla.redhat.com/show_bug.cgi?id=1333159
+        remote.run(args='sudo udevadm control --reload')
+        remote.run(args='sudo udevadm trigger /dev/kvm')
+        remote.run(args='ls -l /dev/kvm')
+
+        qemu_cmd = 'qemu-system-x86_64'
+        if remote.os.package_type == "rpm":
+            qemu_cmd = "/usr/libexec/qemu-kvm"
+        args=[
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            'daemon-helper',
+            'term',
+            qemu_cmd, '-enable-kvm', '-nographic', '-cpu', 'host',
+            '-smp', str(client_config.get('cpus', DEFAULT_CPUS)),
+            '-m', str(client_config.get('memory', DEFAULT_MEM)),
+            # cd holding metadata for cloud-init
+            '-cdrom', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+            ]
+
+        cachemode = 'none'
+        ceph_config = ctx.ceph['ceph'].conf.get('global', {})
+        ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
+        ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+        if ceph_config.get('rbd cache', True):
+            if ceph_config.get('rbd cache max dirty', 1) > 0:
+                cachemode = 'writeback'
+            else:
+                cachemode = 'writethrough'
+
+        disks = client_config['disks']
+        for disk in disks:
+            if 'device_letter' not in disk:
+                continue
+
+            if disk['encryption_format'] == 'none':
+                disk_spec = 'rbd:rbd/{img}:id={id}'.format(
+                    img=disk['image_name'],
+                    id=client[len('client.'):]
+                    )
+            else:
+                disk_spec = disk['device_path']
+
+            args.extend([
+                '-drive',
+                'file={disk_spec},format=raw,if=virtio,cache={cachemode}'.format(
+                    disk_spec=disk_spec,
+                    cachemode=cachemode,
+                    ),
+                ])
+        time_wait = client_config.get('time_wait', 0)
+
+        log.info('starting qemu...')
+        procs.append(
+            remote.run(
+                args=args,
+                logger=log.getChild(client),
+                stdin=run.PIPE,
+                wait=False,
+                )
+            )
+
+    try:
+        yield
+    finally:
+        log.info('waiting for qemu tests to finish...')
+        run.wait(procs)
+
+        if time_wait > 0:
+            log.debug('waiting {time_wait} sec for workloads detect finish...'.format(
+                time_wait=time_wait));
+            time.sleep(time_wait)
+
+        log.debug('checking that qemu tests succeeded...')
+        for client in config.keys():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+
+            # ensure we have permissions to all the logs
+            log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir,
+                                                            client=client)
+            remote.run(
+                args=[
+                    'sudo', 'chmod', 'a+rw', '-R', log_dir
+                    ]
+                )
+
+            # teardown nfs mount
+            _teardown_nfs_mount(remote, client, nfs_service_name)
+            # check for test status
+            remote.run(
+                args=[
+                    'test', '-f',
+                    '{tdir}/archive/qemu/{client}/success'.format(
+                        tdir=testdir,
+                        client=client
+                        ),
+                    ],
+                )
+        log.info("Deleting exported directory...")
+        for client in config.keys():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(args=[
+                'sudo', 'rm', '-r', '/export'
+            ])
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run a test inside of QEMU on top of rbd. Only one test
+    is supported per client.
+
+    For example, you can specify which clients to run on::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+              test: http://download.ceph.com/qa/test.sh
+            client.1:
+              test: http://download.ceph.com/qa/test2.sh
+
+    Or use the same settings on all clients:
+
+        tasks:
+        - ceph:
+        - qemu:
+            all:
+              test: http://download.ceph.com/qa/test.sh
+
+    For tests that want to explicitly describe the RBD images to connect:
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+                test: http://download.ceph.com/qa/test.sh
+                clone: True/False (optionally clone all created disks),
+                image_url: <URL> (optional default image URL)
+                type: filesystem / block (optional default device type)
+                disks: [
+                    {
+                        action: create / clone / none (optional, defaults to create)
+                        image_name: <image name> (optional)
+                        parent_name: <parent_name> (if action == clone),
+                        type: filesystem / block (optional, defaults to fileystem)
+                        image_url: <URL> (optional),
+                        image_size: <MiB> (optional)
+                        encryption_format: luks1 / luks2 / none (optional, defaults to none)
+                    }, ...
+                ]
+
+    You can set the amount of CPUs and memory the VM has (default is 1 CPU and
+    4096 MB)::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+              test: http://download.ceph.com/qa/test.sh
+              cpus: 4
+              memory: 512 # megabytes
+
+    If you need to configure additional cloud-config options, set cloud_config
+    to the required data set::
+
+        tasks:
+        - ceph
+        - qemu:
+            client.0:
+                test: http://ceph.com/qa/test.sh
+                cloud_config_archive:
+                    - |
+                      #/bin/bash
+                      touch foo1
+                    - content: |
+                        test data
+                      type: text/plain
+                      filename: /tmp/data
+    """
+    assert isinstance(config, dict), \
+           "task qemu only supports a dictionary for configuration"
+
+    config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    normalize_disks(config)
+
+    managers = []
+    create_images(ctx=ctx, config=config, managers=managers)
+    managers.extend([
+        lambda: create_dirs(ctx=ctx, config=config),
+        lambda: install_block_rbd_driver(ctx=ctx, config=config),
+        lambda: generate_iso(ctx=ctx, config=config),
+        lambda: download_image(ctx=ctx, config=config),
+        ])
+    create_clones(ctx=ctx, config=config, managers=managers)
+    create_encrypted_devices(ctx=ctx, config=config, managers=managers)
+    managers.append(
+        lambda: run_qemu(ctx=ctx, config=config),
+        )
+
+    with contextutil.nested(*managers):
+        yield
diff --git a/qa/tasks/rados.py b/qa/tasks/rados.py
new file mode 100644
index 000000000..349f46efc
--- /dev/null
+++ b/qa/tasks/rados.py
@@ -0,0 +1,277 @@
+"""
+Rados modle-based integration tests
+"""
+import contextlib
+import logging
+import gevent
+from teuthology import misc as teuthology
+
+
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run RadosModel-based integration tests.
+
+    The config should be as follows::
+
+        rados:
+          clients: [client list]
+          ops: <number of ops>
+          objects: <number of objects to use>
+          max_in_flight: <max number of operations in flight>
+          object_size: <size of objects in bytes>
+          min_stride_size: <minimum write stride size in bytes>
+          max_stride_size: <maximum write stride size in bytes>
+          op_weights: <dictionary mapping operation type to integer weight>
+          runs: <number of times to run> - the pool is remade between runs
+          ec_pool: use an ec pool
+          erasure_code_profile: profile to use with the erasure coded pool
+          fast_read: enable ec_pool's fast_read
+          min_size: set the min_size of created pool
+          pool_snaps: use pool snapshots instead of selfmanaged snapshots
+	  write_fadvise_dontneed: write behavior like with LIBRADOS_OP_FLAG_FADVISE_DONTNEED.
+	                          This mean data don't access in the near future.
+				  Let osd backend don't keep data in cache.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rados:
+            clients: [client.0]
+            ops: 1000
+            max_seconds: 0   # 0 for no limit
+            objects: 25
+            max_in_flight: 16
+            object_size: 4000000
+            min_stride_size: 1024
+            max_stride_size: 4096
+            op_weights:
+              read: 20
+              write: 10
+              delete: 2
+              snap_create: 3
+              rollback: 2
+              snap_remove: 0
+            ec_pool: create an ec pool, defaults to False
+            erasure_code_use_overwrites: test overwrites, default false
+            erasure_code_profile:
+              name: teuthologyprofile
+              k: 2
+              m: 1
+              crush-failure-domain: osd
+            pool_snaps: true
+	    write_fadvise_dontneed: true
+            runs: 10
+        - interactive:
+
+    Optionally, you can provide the pool name to run against:
+
+        tasks:
+        - ceph:
+        - exec:
+            client.0:
+              - ceph osd pool create foo
+        - rados:
+            clients: [client.0]
+            pools: [foo]
+            ...
+
+    Alternatively, you can provide a pool prefix:
+
+        tasks:
+        - ceph:
+        - exec:
+            client.0:
+              - ceph osd pool create foo.client.0
+        - rados:
+            clients: [client.0]
+            pool_prefix: foo
+            ...
+
+    The tests are run asynchronously, they are not complete when the task
+    returns. For instance:
+
+        - rados:
+            clients: [client.0]
+            pools: [ecbase]
+            ops: 4000
+            objects: 500
+            op_weights:
+              read: 100
+              write: 100
+              delete: 50
+              copy_from: 50
+        - print: "**** done rados ec-cache-agent (part 2)"
+
+     will run the print task immediately after the rados tasks begins but
+     not after it completes. To make the rados task a blocking / sequential
+     task, use:
+
+        - sequential:
+          - rados:
+              clients: [client.0]
+              pools: [ecbase]
+              ops: 4000
+              objects: 500
+              op_weights:
+                read: 100
+                write: 100
+                delete: 50
+                copy_from: 50
+        - print: "**** done rados ec-cache-agent (part 2)"
+
+    """
+    log.info('Beginning rados...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+
+    object_size = int(config.get('object_size', 4000000))
+    op_weights = config.get('op_weights', {})
+    testdir = teuthology.get_testdir(ctx)
+    args = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'ceph_test_rados']
+    if config.get('ec_pool', False):
+        args.extend(['--no-omap'])
+        if not config.get('erasure_code_use_overwrites', False):
+            args.extend(['--ec-pool'])
+    if config.get('write_fadvise_dontneed', False):
+        args.extend(['--write-fadvise-dontneed'])
+    if config.get('set_redirect', False):
+        args.extend(['--set_redirect'])
+    if config.get('set_chunk', False):
+        args.extend(['--set_chunk'])
+    if config.get('enable_dedup', False):
+        args.extend(['--enable_dedup'])
+    if config.get('low_tier_pool', None):
+        args.extend(['--low_tier_pool', config.get('low_tier_pool', None)])
+    if config.get('pool_snaps', False):
+        args.extend(['--pool-snaps'])
+    if config.get('balance_reads', False):
+        args.extend(['--balance-reads'])
+    if config.get('localize_reads', False):
+        args.extend(['--localize-reads'])
+    args.extend([
+        '--max-ops', str(config.get('ops', 10000)),
+        '--objects', str(config.get('objects', 500)),
+        '--max-in-flight', str(config.get('max_in_flight', 16)),
+        '--size', str(object_size),
+        '--min-stride-size', str(config.get('min_stride_size', object_size // 10)),
+        '--max-stride-size', str(config.get('max_stride_size', object_size // 5)),
+        '--max-seconds', str(config.get('max_seconds', 0))
+        ])
+
+    weights = {}
+    weights['read'] = 100
+    weights['write'] = 100
+    weights['delete'] = 10
+    # Parallel of the op_types in test/osd/TestRados.cc
+    for field in [
+        # read handled above
+        # write handled above
+        # delete handled above
+        "snap_create",
+        "snap_remove",
+        "rollback",
+        "setattr",
+        "rmattr",
+        "watch",
+        "copy_from",
+        "hit_set_list",
+        "is_dirty",
+        "undirty",
+        "cache_flush",
+        "cache_try_flush",
+        "cache_evict",
+        "append",
+        "write",
+        "read",
+        "delete"
+        ]:
+        if field in op_weights:
+            weights[field] = op_weights[field]
+
+    if config.get('write_append_excl', True):
+        if 'write' in weights:
+            weights['write'] = weights['write'] // 2
+            weights['write_excl'] = weights['write']
+
+        if 'append' in weights:
+            weights['append'] = weights['append'] // 2
+            weights['append_excl'] = weights['append']
+
+    for op, weight in weights.items():
+        args.extend([
+            '--op', op, str(weight)
+        ])
+                
+
+    def thread():
+        """Thread spawned by gevent"""
+        clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+        log.info('clients are %s' % clients)
+        manager = ctx.managers['ceph']
+        if config.get('ec_pool', False):
+            profile = config.get('erasure_code_profile', {})
+            profile_name = profile.get('name', 'teuthologyprofile')
+            manager.create_erasure_code_profile(profile_name, profile)
+        else:
+            profile_name = None
+        for i in range(int(config.get('runs', '1'))):
+            log.info("starting run %s out of %s", str(i), config.get('runs', '1'))
+            tests = {}
+            existing_pools = config.get('pools', [])
+            created_pools = []
+            for role in config.get('clients', clients):
+                assert isinstance(role, str)
+                PREFIX = 'client.'
+                assert role.startswith(PREFIX)
+                id_ = role[len(PREFIX):]
+
+                pool = config.get('pool', None)
+                if not pool and existing_pools:
+                    pool = existing_pools.pop()
+                else:
+                    pool = manager.create_pool_with_unique_name(
+                        erasure_code_profile_name=profile_name,
+                        erasure_code_use_overwrites=
+                          config.get('erasure_code_use_overwrites', False)
+                    )
+                    created_pools.append(pool)
+                    if config.get('fast_read', False):
+                        manager.raw_cluster_cmd(
+                            'osd', 'pool', 'set', pool, 'fast_read', 'true')
+                    min_size = config.get('min_size', None);
+                    if min_size is not None:
+                        manager.raw_cluster_cmd(
+                            'osd', 'pool', 'set', pool, 'min_size', str(min_size))
+
+                (remote,) = ctx.cluster.only(role).remotes.keys()
+                proc = remote.run(
+                    args=["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args +
+                    ["--pool", pool],
+                    logger=log.getChild("rados.{id}".format(id=id_)),
+                    stdin=run.PIPE,
+                    wait=False
+                    )
+                tests[id_] = proc
+            run.wait(tests.values())
+
+            for pool in created_pools:
+                manager.wait_snap_trimming_complete(pool);
+                manager.remove_pool(pool)
+
+    running = gevent.spawn(thread)
+
+    try:
+        yield
+    finally:
+        log.info('joining rados')
+        running.get()
diff --git a/qa/tasks/radosbench.py b/qa/tasks/radosbench.py
new file mode 100644
index 000000000..0804840c5
--- /dev/null
+++ b/qa/tasks/radosbench.py
@@ -0,0 +1,143 @@
+"""
+Rados benchmarking
+"""
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run radosbench
+
+    The config should be as follows:
+
+    radosbench:
+        clients: [client list]
+        time: <seconds to run>
+        pool: <pool to use>
+        size: write size to use
+        concurrency: max number of outstanding writes (16)
+        objectsize: object size to use
+        unique_pool: use a unique pool, defaults to False
+        ec_pool: create an ec pool, defaults to False
+        create_pool: create pool, defaults to True
+        erasure_code_profile:
+          name: teuthologyprofile
+          k: 2
+          m: 1
+          crush-failure-domain: osd
+        cleanup: false (defaults to true)
+        type: <write|seq|rand> (defaults to write)
+    example:
+
+    tasks:
+    - ceph:
+    - radosbench:
+        clients: [client.0]
+        time: 360
+    - interactive:
+    """
+    log.info('Beginning radosbench...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+    radosbench = {}
+
+    testdir = teuthology.get_testdir(ctx)
+    manager = ctx.managers['ceph']
+    runtype = config.get('type', 'write')
+
+    create_pool = config.get('create_pool', True)
+    for role in config.get('clients', ['client.0']):
+        assert isinstance(role, str)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+
+        if config.get('ec_pool', False):
+            profile = config.get('erasure_code_profile', {})
+            profile_name = profile.get('name', 'teuthologyprofile')
+            manager.create_erasure_code_profile(profile_name, profile)
+        else:
+            profile_name = None
+
+        cleanup = []
+        if not config.get('cleanup', True):
+            cleanup = ['--no-cleanup']
+        write_to_omap = []
+        if config.get('write-omap', False):
+            write_to_omap = ['--write-omap']
+            log.info('omap writes')
+
+        pool = config.get('pool', 'data')
+        if create_pool:
+            if pool != 'data':
+                manager.create_pool(pool, erasure_code_profile_name=profile_name)
+            else:
+                pool = manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name)
+
+        concurrency = config.get('concurrency', 16)
+        osize = config.get('objectsize', 65536)
+        if osize == 0:
+            objectsize = []
+        else:
+            objectsize = ['--object-size', str(osize)]
+        size = ['-b', str(config.get('size', 65536))]
+        # If doing a reading run then populate data
+        if runtype != "write":
+            proc = remote.run(
+                args=[
+                    "/bin/sh", "-c",
+                    " ".join(['adjust-ulimits',
+                              'ceph-coverage',
+                              '{tdir}/archive/coverage',
+                              'rados',
+                              '--no-log-to-stderr',
+                              '--name', role] +
+                              ['-t', str(concurrency)]
+                              + size + objectsize +
+                              ['-p' , pool,
+                          'bench', str(60), "write", "--no-cleanup"
+                          ]).format(tdir=testdir),
+                ],
+            logger=log.getChild('radosbench.{id}'.format(id=id_)),
+            wait=True
+            )
+            size = []
+            objectsize = []
+
+        proc = remote.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'rados',
+			  '--no-log-to-stderr',
+                          '--name', role]
+                          + size + objectsize +
+                          ['-p' , pool,
+                          'bench', str(config.get('time', 360)), runtype,
+                          ] + write_to_omap + cleanup).format(tdir=testdir),
+                ],
+            logger=log.getChild('radosbench.{id}'.format(id=id_)),
+            stdin=run.PIPE,
+            wait=False
+            )
+        radosbench[id_] = proc
+
+    try:
+        yield
+    finally:
+        timeout = config.get('time', 360) * 30 + 300
+        log.info('joining radosbench (timing out after %ss)', timeout)
+        run.wait(radosbench.values(), timeout=timeout)
+
+        if pool != 'data' and create_pool:
+            manager.remove_pool(pool)
diff --git a/qa/tasks/radosbenchsweep.py b/qa/tasks/radosbenchsweep.py
new file mode 100644
index 000000000..df0ba1ed1
--- /dev/null
+++ b/qa/tasks/radosbenchsweep.py
@@ -0,0 +1,222 @@
+"""
+Rados benchmarking sweep
+"""
+import contextlib
+import logging
+import re
+
+from io import BytesIO
+from itertools import product
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Execute a radosbench parameter sweep
+
+    Puts radosbench in a loop, taking values from the given config at each
+    iteration. If given, the min and max values below create a range, e.g.
+    min_replicas=1 and max_replicas=3 implies executing with 1-3 replicas.
+
+    Parameters:
+
+        clients: [client list]
+        time: seconds to run (default=120)
+        sizes: [list of object sizes] (default=[4M])
+        mode: <write|read|seq> (default=write)
+        repetitions: execute the same configuration multiple times (default=1)
+        min_num_replicas: minimum number of replicas to use (default = 3)
+        max_num_replicas: maximum number of replicas to use (default = 3)
+        min_num_osds: the minimum number of OSDs in a pool (default=all)
+        max_num_osds: the maximum number of OSDs in a pool (default=all)
+        file: name of CSV-formatted output file (default='radosbench.csv')
+        columns: columns to include (default=all)
+          - rep: execution number (takes values from 'repetitions')
+          - num_osd: number of osds for pool
+          - num_replica: number of replicas
+          - avg_throughput: throughput
+          - avg_latency: latency
+          - stdev_throughput:
+          - stdev_latency:
+
+    Example:
+    - radsobenchsweep:
+        columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput]
+    """
+    log.info('Beginning radosbenchsweep...')
+    assert isinstance(config, dict), 'expecting dictionary for configuration'
+
+    # get and validate config values
+    # {
+
+    # only one client supported for now
+    if len(config.get('clients', [])) != 1:
+        raise Exception("Only one client can be specified")
+
+    # only write mode
+    if config.get('mode', 'write') != 'write':
+        raise Exception("Only 'write' mode supported for now.")
+
+    # OSDs
+    total_osds_in_cluster = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    min_num_osds = config.get('min_num_osds', total_osds_in_cluster)
+    max_num_osds = config.get('max_num_osds', total_osds_in_cluster)
+
+    if max_num_osds > total_osds_in_cluster:
+        raise Exception('max_num_osds cannot be greater than total in cluster')
+    if min_num_osds < 1:
+        raise Exception('min_num_osds cannot be less than 1')
+    if min_num_osds > max_num_osds:
+        raise Exception('min_num_osds cannot be greater than max_num_osd')
+    osds = range(0, (total_osds_in_cluster + 1))
+
+    # replicas
+    min_num_replicas = config.get('min_num_replicas', 3)
+    max_num_replicas = config.get('max_num_replicas', 3)
+
+    if min_num_replicas < 1:
+        raise Exception('min_num_replicas cannot be less than 1')
+    if min_num_replicas > max_num_replicas:
+        raise Exception('min_num_replicas cannot be greater than max_replicas')
+    if max_num_replicas > max_num_osds:
+        raise Exception('max_num_replicas cannot be greater than max_num_osds')
+    replicas = range(min_num_replicas, (max_num_replicas + 1))
+
+    # object size
+    sizes = config.get('size', [4 << 20])
+
+    # repetitions
+    reps = range(config.get('repetitions', 1))
+
+    # file
+    fname = config.get('file', 'radosbench.csv')
+    f = open('{}/{}'.format(ctx.archive, fname), 'w')
+    f.write(get_csv_header(config) + '\n')
+    # }
+
+    # set default pools size=1 to avoid 'unhealthy' issues
+    ctx.manager.set_pool_property('data', 'size', 1)
+    ctx.manager.set_pool_property('metadata', 'size', 1)
+    ctx.manager.set_pool_property('rbd', 'size', 1)
+
+    current_osds_out = 0
+
+    # sweep through all parameters
+    for osds_out, size, replica, rep in product(osds, sizes, replicas, reps):
+
+        osds_in = total_osds_in_cluster - osds_out
+
+        if osds_in == 0:
+            # we're done
+            break
+
+        if current_osds_out != osds_out:
+            # take an osd out
+            ctx.manager.raw_cluster_cmd(
+                'osd', 'reweight', str(osds_out-1), '0.0')
+            wait_until_healthy(ctx, config)
+            current_osds_out = osds_out
+
+        if osds_in not in range(min_num_osds, (max_num_osds + 1)):
+            # no need to execute with a number of osds that wasn't requested
+            continue
+
+        if osds_in < replica:
+            # cannot execute with more replicas than available osds
+            continue
+
+        run_radosbench(ctx, config, f, osds_in, size, replica, rep)
+
+    f.close()
+
+    yield
+
+
+def get_csv_header(conf):
+    all_columns = [
+        'rep', 'num_osd', 'num_replica', 'avg_throughput',
+        'avg_latency', 'stdev_throughput', 'stdev_latency'
+    ]
+    given_columns = conf.get('columns', None)
+    if given_columns and len(given_columns) != 0:
+        for column in given_columns:
+            if column not in all_columns:
+                raise Exception('Unknown column ' + column)
+        return ','.join(conf['columns'])
+    else:
+        conf['columns'] = all_columns
+        return ','.join(all_columns)
+
+
+def run_radosbench(ctx, config, f, num_osds, size, replica, rep):
+    pool = ctx.manager.create_pool_with_unique_name()
+
+    ctx.manager.set_pool_property(pool, 'size', replica)
+
+    wait_until_healthy(ctx, config)
+
+    log.info('Executing with parameters: ')
+    log.info('  num_osd =' + str(num_osds))
+    log.info('  size =' + str(size))
+    log.info('  num_replicas =' + str(replica))
+    log.info('  repetition =' + str(rep))
+
+    for role in config.get('clients', ['client.0']):
+        assert isinstance(role, str)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+
+        proc = remote.run(
+            args=[
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{}/archive/coverage'.format(teuthology.get_testdir(ctx)),
+                'rados',
+                '--no-log-to-stderr',
+                '--name', role,
+                '-b', str(size),
+                '-p', pool,
+                'bench', str(config.get('time', 120)), 'write',
+            ],
+            logger=log.getChild('radosbench.{id}'.format(id=id_)),
+            stdin=run.PIPE,
+            stdout=BytesIO(),
+            wait=False
+        )
+
+        # parse output to get summary and format it as CSV
+        proc.wait()
+        out = proc.stdout.getvalue()
+        all_values = {
+            'stdev_throughput': re.sub(r'Stddev Bandwidth: ', '', re.search(
+                r'Stddev Bandwidth:.*', out).group(0)),
+            'stdev_latency': re.sub(r'Stddev Latency: ', '', re.search(
+                r'Stddev Latency:.*', out).group(0)),
+            'avg_throughput': re.sub(r'Bandwidth \(MB/sec\): ', '', re.search(
+                r'Bandwidth \(MB/sec\):.*', out).group(0)),
+            'avg_latency': re.sub(r'Average Latency: ', '', re.search(
+                r'Average Latency:.*', out).group(0)),
+            'rep': str(rep),
+            'num_osd': str(num_osds),
+            'num_replica': str(replica)
+        }
+        values_to_write = []
+        for column in config['columns']:
+            values_to_write.extend([all_values[column]])
+        f.write(','.join(values_to_write) + '\n')
+
+    ctx.manager.remove_pool(pool)
+
+
+def wait_until_healthy(ctx, config):
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
+    teuthology.wait_until_healthy(ctx, mon_remote)
diff --git a/qa/tasks/radosgw_admin.py b/qa/tasks/radosgw_admin.py
new file mode 100644
index 000000000..836736f1a
--- /dev/null
+++ b/qa/tasks/radosgw_admin.py
@@ -0,0 +1,1088 @@
+"""
+Rgw admin testing against a running instance
+"""
+# The test cases in this file have been annotated for inventory.
+# To extract the inventory (in csv format) use the command:
+#
+#   grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
+#
+# to run this standalone:
+#	python qa/tasks/radosgw_admin.py [USER] HOSTNAME
+#
+
+import json
+import logging
+import time
+import datetime
+import sys
+
+from io import BytesIO
+from queue import Queue
+
+import boto.exception
+import boto.s3.connection
+import boto.s3.acl
+
+import httplib2
+
+
+from tasks.util.rgw import rgwadmin, get_user_summary, get_user_successful_ops
+
+log = logging.getLogger(__name__)
+
+def usage_acc_findentry2(entries, user, add=True):
+    for e in entries:
+        if e['user'] == user:
+            return e
+    if not add:
+            return None
+    e = {'user': user, 'buckets': []}
+    entries.append(e)
+    return e
+def usage_acc_findsum2(summaries, user, add=True):
+    for e in summaries:
+        if e['user'] == user:
+            return e
+    if not add:
+        return None
+    e = {'user': user, 'categories': [],
+        'total': {'bytes_received': 0,
+            'bytes_sent': 0, 'ops': 0, 'successful_ops': 0 }}
+    summaries.append(e)
+    return e
+def usage_acc_update2(x, out, b_in, err):
+    x['bytes_sent'] += b_in
+    x['bytes_received'] += out
+    x['ops'] += 1
+    if not err:
+        x['successful_ops'] += 1
+def usage_acc_validate_fields(r, x, x2, what):
+    q=[]
+    for field in ['bytes_sent', 'bytes_received', 'ops', 'successful_ops']:
+        try:
+            if x2[field] < x[field]:
+                q.append("field %s: %d < %d" % (field, x2[field], x[field]))
+        except Exception as ex:
+            r.append( "missing/bad field " + field + " in " + what + " " + str(ex))
+            return
+    if len(q) > 0:
+        r.append("incomplete counts in " + what + ": " + ", ".join(q))
+class usage_acc:
+    def __init__(self):
+        self.results = {'entries': [], 'summary': []}
+    def findentry(self, user):
+        return usage_acc_findentry2(self.results['entries'], user)
+    def findsum(self, user):
+        return usage_acc_findsum2(self.results['summary'], user)
+    def e2b(self, e, bucket, add=True):
+        for b in e['buckets']:
+            if b['bucket'] == bucket:
+                return b
+        if not add:
+                return None
+        b = {'bucket': bucket, 'categories': []}
+        e['buckets'].append(b)
+        return b
+    def c2x(self, c, cat, add=True):
+        for x in c:
+            if x['category'] == cat:
+                return x
+        if not add:
+                return None
+        x = {'bytes_received': 0, 'category': cat,
+            'bytes_sent': 0, 'ops': 0, 'successful_ops': 0 }
+        c.append(x)
+        return x
+    def update(self, c, cat, user, out, b_in, err):
+        x = self.c2x(c, cat)
+        usage_acc_update2(x, out, b_in, err)
+        if not err and cat == 'create_bucket' and 'owner' not in x:
+            x['owner'] = user
+    def make_entry(self, cat, bucket, user, out, b_in, err):
+        if cat == 'create_bucket' and err:
+                return
+        e = self.findentry(user)
+        b = self.e2b(e, bucket)
+        self.update(b['categories'], cat, user, out, b_in, err)
+        s = self.findsum(user)
+        x = self.c2x(s['categories'], cat)
+        usage_acc_update2(x, out, b_in, err)
+        x = s['total']
+        usage_acc_update2(x, out, b_in, err)
+    def generate_make_entry(self):
+        return lambda cat,bucket,user,out,b_in,err: self.make_entry(cat, bucket, user, out, b_in, err)
+    def get_usage(self):
+        return self.results
+    def compare_results(self, results):
+        if 'entries' not in results or 'summary' not in results:
+            return ['Missing entries or summary']
+        r = []
+        for e in self.results['entries']:
+            try:
+                e2 = usage_acc_findentry2(results['entries'], e['user'], False)
+            except Exception as ex:
+                r.append("malformed entry looking for user "
+		    + e['user'] + " " + str(ex))
+                break
+            if e2 == None:
+                r.append("missing entry for user " + e['user'])
+                continue
+            for b in e['buckets']:
+                c = b['categories']
+                if b['bucket'] == 'nosuchbucket':
+                    print("got here")
+                try:
+                    b2 = self.e2b(e2, b['bucket'], False)
+                    if b2 != None:
+                            c2 = b2['categories']
+                except Exception as ex:
+                    r.append("malformed entry looking for bucket "
+			+ b['bucket'] + " in user " + e['user'] + " " + str(ex))
+                    break
+                if b2 == None:
+                    r.append("can't find bucket " + b['bucket']
+			+ " in user " + e['user'])
+                    continue
+                for x in c:
+                    try:
+                        x2 = self.c2x(c2, x['category'], False)
+                    except Exception as ex:
+                        r.append("malformed entry looking for "
+			    + x['category'] + " in bucket " + b['bucket']
+			    + " user " + e['user'] + " " + str(ex))
+                        break
+                    usage_acc_validate_fields(r, x, x2, "entry: category "
+			+ x['category'] + " bucket " + b['bucket']
+			+ " in user " + e['user'])
+        for s in self.results['summary']:
+            c = s['categories']
+            try:
+                s2 = usage_acc_findsum2(results['summary'], s['user'], False)
+            except Exception as ex:
+                r.append("malformed summary looking for user " + e['user']
+		    + " " + str(ex))
+                break
+                if s2 == None:
+                    r.append("missing summary for user " + e['user'] + " " + str(ex))
+                    continue
+            try:
+                c2 = s2['categories']
+            except Exception as ex:
+                r.append("malformed summary missing categories for user "
+		    + e['user'] + " " + str(ex))
+                break
+            for x in c:
+                try:
+                    x2 = self.c2x(c2, x['category'], False)
+                except Exception as ex:
+                    r.append("malformed summary looking for "
+			+ x['category'] + " user " + e['user'] + " " + str(ex))
+                    break
+                usage_acc_validate_fields(r, x, x2, "summary: category "
+		    + x['category'] + " in user " + e['user'])
+            x = s['total']
+            try:
+                x2 = s2['total']
+            except Exception as ex:
+                r.append("malformed summary looking for totals for user "
+                         + e['user'] + " " + str(ex))
+                break
+            usage_acc_validate_fields(r, x, x2, "summary: totals for user" + e['user'])
+        return r
+
+def ignore_this_entry(cat, bucket, user, out, b_in, err):
+    pass
+class requestlog_queue():
+    def __init__(self, add):
+        self.q = Queue(1000)
+        self.adder = add
+    def handle_request_data(self, request, response, error=False):
+        now = datetime.datetime.now()
+        if error:
+            pass
+        elif response.status < 200 or response.status >= 400:
+            error = True
+        self.q.put({'t': now, 'o': request, 'i': response, 'e': error})
+    def clear(self):
+        with self.q.mutex:
+            self.q.queue.clear()
+    def log_and_clear(self, cat, bucket, user, add_entry = None):
+        while not self.q.empty():
+            j = self.q.get()
+            bytes_out = 0
+            if 'Content-Length' in j['o'].headers:
+                bytes_out = int(j['o'].headers['Content-Length'])
+            bytes_in = 0
+            msg = j['i'].msg
+            if 'content-length'in msg:
+                bytes_in = int(msg['content-length'])
+            log.info('RL: %s %s %s bytes_out=%d bytes_in=%d failed=%r'
+                     % (cat, bucket, user, bytes_out, bytes_in, j['e']))
+            if add_entry == None:
+                add_entry = self.adder
+            add_entry(cat, bucket, user, bytes_out, bytes_in, j['e'])
+
+def create_presigned_url(conn, method, bucket_name, key_name, expiration):
+    return conn.generate_url(expires_in=expiration,
+        method=method,
+        bucket=bucket_name,
+        key=key_name,
+        query_auth=True,
+    )
+
+def send_raw_http_request(conn, method, bucket_name, key_name, follow_redirects = False):
+    url = create_presigned_url(conn, method, bucket_name, key_name, 3600)
+    print(url)
+    h = httplib2.Http()
+    h.follow_redirects = follow_redirects
+    return h.request(url, method)
+
+
+def get_acl(key):
+    """
+    Helper function to get the xml acl from a key, ensuring that the xml
+    version tag is removed from the acl response
+    """
+    raw_acl = key.get_xml_acl().decode()
+
+    def remove_version(string):
+        return string.split(
+            '<?xml version="1.0" encoding="UTF-8"?>'
+        )[-1]
+
+    def remove_newlines(string):
+        return string.strip('\n')
+
+    return remove_version(
+        remove_newlines(raw_acl)
+    )
+
+def task(ctx, config):
+    """
+    Test radosgw-admin functionality against a running rgw instance.
+    """
+    global log
+
+    assert ctx.rgw.config, \
+        "radosgw_admin task needs a config passed from the rgw task"
+    config = ctx.rgw.config
+    log.debug('config is: %r', config)
+
+    clients_from_config = config.keys()
+
+    # choose first client as default
+    client = next(iter(clients_from_config))
+
+    # once the client is chosen, pull the host name and  assigned port out of
+    # the role_endpoints that were assigned by the rgw task
+    endpoint = ctx.rgw.role_endpoints[client]
+
+    ##
+    user1='foo'
+    user2='fud'
+    user3='bar'
+    user4='bud'
+    subuser1='foo:foo1'
+    subuser2='foo:foo2'
+    display_name1='Foo'
+    display_name2='Fud'
+    display_name3='Bar'
+    email='foo@foo.com'
+    access_key='9te6NH5mcdcq0Tc5i8i1'
+    secret_key='Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu'
+    access_key2='p5YnriCv1nAtykxBrupQ'
+    secret_key2='Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh'
+    access_key3='NX5QOQKC6BH2IDN8HC7A'
+    secret_key3='LnEsqNNqZIpkzauboDcLXLcYaWwLQ3Kop0zAnKIn'
+    swift_secret1='gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL'
+    swift_secret2='ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy'
+
+    bucket_name='myfoo'
+    bucket_name2='mybar'
+
+    # connect to rgw
+    connection = boto.s3.connection.S3Connection(
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        is_secure=False,
+        port=endpoint.port,
+        host=endpoint.hostname,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+    connection2 = boto.s3.connection.S3Connection(
+        aws_access_key_id=access_key2,
+        aws_secret_access_key=secret_key2,
+        is_secure=False,
+        port=endpoint.port,
+        host=endpoint.hostname,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+    connection3 = boto.s3.connection.S3Connection(
+        aws_access_key_id=access_key3,
+        aws_secret_access_key=secret_key3,
+        is_secure=False,
+        port=endpoint.port,
+        host=endpoint.hostname,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+
+    acc = usage_acc()
+    rl = requestlog_queue(acc.generate_make_entry())
+    connection.set_request_hook(rl)
+    connection2.set_request_hook(rl)
+    connection3.set_request_hook(rl)
+
+    # legend (test cases can be easily grep-ed out)
+    # TESTCASE 'testname','object','method','operation','assertion'
+
+    # TESTCASE 'usage-show0' 'usage' 'show' 'all usage' 'succeeds'
+    (err, summary0) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+
+    # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+    assert err
+
+    # TESTCASE 'create-ok','user','create','w/all valid info','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--uid', user1,
+            '--display-name', display_name1,
+            '--email', email,
+            '--access-key', access_key,
+            '--secret', secret_key,
+            '--max-buckets', '4'
+            ],
+            check_status=True)
+
+    # TESTCASE 'duplicate email','user','create','existing user email','fails'
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--uid', user2,
+            '--display-name', display_name2,
+            '--email', email,
+            ])
+    assert err
+
+    # TESTCASE 'info-existing','user','info','existing user','returns correct info'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert out['user_id'] == user1
+    assert out['email'] == email
+    assert out['display_name'] == display_name1
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+    assert not out['suspended']
+
+    # TESTCASE 'suspend-ok','user','suspend','active user','succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
+        check_status=True)
+
+    # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert out['suspended']
+
+    # TESTCASE 're-enable','user','enable','suspended user','succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1], check_status=True)
+
+    # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert not out['suspended']
+
+    # TESTCASE 'add-keys','key','create','w/valid info','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'key', 'create', '--uid', user1,
+            '--access-key', access_key2, '--secret', secret_key2,
+            ], check_status=True)
+
+    # TESTCASE 'info-new-key','user','info','after key addition','returns all keys'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1],
+        check_status=True)
+    assert len(out['keys']) == 2
+    assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2
+    assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2
+
+    # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed'
+    (err, out) = rgwadmin(ctx, client, [
+            'key', 'rm', '--uid', user1,
+            '--access-key', access_key2,
+            ], check_status=True)
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+
+    # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+    subuser_access = 'full'
+    subuser_perm = 'full-control'
+
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'create', '--subuser', subuser1,
+            '--access', subuser_access
+            ], check_status=True)
+
+    # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'modify', '--subuser', subuser1,
+            '--secret', swift_secret1,
+            '--key-type', 'swift',
+            ], check_status=True)
+
+    # TESTCASE 'subuser-perm-mask', 'subuser', 'info', 'test subuser perm mask durability', 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+
+    assert out['subusers'][0]['permissions'] == subuser_perm
+
+    # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert len(out['swift_keys']) == 1
+    assert out['swift_keys'][0]['user'] == subuser1
+    assert out['swift_keys'][0]['secret_key'] == swift_secret1
+
+    # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'create', '--subuser', subuser2,
+            '--secret', swift_secret2,
+            '--key-type', 'swift',
+            ], check_status=True)
+
+    # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert len(out['swift_keys']) == 2
+    assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2
+    assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2
+
+    # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed'
+    (err, out) = rgwadmin(ctx, client, [
+            'key', 'rm', '--subuser', subuser1,
+            '--key-type', 'swift',
+            ], check_status=True)
+    assert len(out['swift_keys']) == 1
+
+    # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed'
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'rm', '--subuser', subuser1,
+            ], check_status=True)
+    assert len(out['subusers']) == 1
+
+    # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subser and key is removed'
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'rm', '--subuser', subuser2,
+            '--key-type', 'swift', '--purge-keys',
+            ], check_status=True)
+    assert len(out['swift_keys']) == 0
+    assert len(out['subusers']) == 0
+
+    # TESTCASE 'bucket-stats','bucket','stats','no session/buckets','succeeds, empty list'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1],
+        check_status=True)
+    assert len(out) == 0
+
+    # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
+    assert len(out) == 0
+
+    # create a first bucket
+    bucket = connection.create_bucket(bucket_name)
+
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+
+    # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
+    assert len(out) == 1
+    assert out[0] == bucket_name
+
+    bucket_list = connection.get_all_buckets()
+    assert len(bucket_list) == 1
+    assert bucket_list[0].name == bucket_name
+
+    rl.log_and_clear("list_buckets", '', user1)
+
+    # TESTCASE 'bucket-list-all','bucket','list','all buckets','succeeds, expected list'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'list'], check_status=True)
+    assert len(out) >= 1
+    assert bucket_name in out;
+
+    # TESTCASE 'max-bucket-limit,'bucket','create','4 buckets','5th bucket fails due to max buckets == 4'
+    bucket2 = connection.create_bucket(bucket_name + '2')
+    rl.log_and_clear("create_bucket", bucket_name + '2', user1)
+    bucket3 = connection.create_bucket(bucket_name + '3')
+    rl.log_and_clear("create_bucket", bucket_name + '3', user1)
+    bucket4 = connection.create_bucket(bucket_name + '4')
+    rl.log_and_clear("create_bucket", bucket_name + '4', user1)
+    # the 5th should fail.
+    failed = False
+    try:
+        connection.create_bucket(bucket_name + '5')
+    except Exception:
+        failed = True
+    assert failed
+    rl.log_and_clear("create_bucket", bucket_name + '5', user1)
+
+    # delete the buckets
+    bucket2.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '2', user1)
+    bucket3.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '3', user1)
+    bucket4.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '4', user1)
+
+    # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
+    (err, out) = rgwadmin(ctx, client, [
+            'bucket', 'stats', '--bucket', bucket_name], check_status=True)
+    assert out['owner'] == user1
+    bucket_id = out['id']
+
+    # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1], check_status=True)
+    assert len(out) == 1
+    assert out[0]['id'] == bucket_id    # does it return the same ID twice in a row?
+
+    # use some space
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('one')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
+    (err, out) = rgwadmin(ctx, client, [
+            'bucket', 'stats', '--bucket', bucket_name], check_status=True)
+    assert out['id'] == bucket_id
+    assert out['usage']['rgw.main']['num_objects'] == 1
+    assert out['usage']['rgw.main']['size_kb'] > 0
+
+    #validate we have a positive user stats now
+    (err, out) = rgwadmin(ctx, client,
+                          ['user', 'stats','--uid', user1, '--sync-stats'],
+                          check_status=True)
+    assert out['stats']['size'] > 0
+
+    # reclaim it
+    key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
+
+    # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error'
+    (err, out) = rgwadmin(ctx, client,
+        ['bucket', 'unlink', '--uid', user1, '--bucket', bucket_name],
+        check_status=True)
+
+    # create a second user to link the bucket to
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--uid', user2,
+            '--display-name', display_name2,
+            '--access-key', access_key2,
+            '--secret', secret_key2,
+            '--max-buckets', '1',
+            ],
+            check_status=True)
+
+    # try creating an object with the first user before the bucket is relinked
+    denied = False
+    key = boto.s3.key.Key(bucket)
+
+    try:
+        key.set_contents_from_string('two')
+    except boto.exception.S3ResponseError:
+        denied = True
+
+    assert not denied
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # delete the object
+    key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
+
+    # link the bucket to another user
+    (err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n=bucket_name)],
+        check_status=True)
+
+    bucket_data = out['data']
+    assert bucket_data['bucket']['name'] == bucket_name
+
+    bucket_id = bucket_data['bucket']['bucket_id']
+
+    # link the bucket to another user
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--uid', user2, '--bucket', bucket_name, '--bucket-id', bucket_id],
+        check_status=True)
+
+    # try to remove user, should fail (has a linked bucket)
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2])
+    assert err
+
+    # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'succeeds, bucket unlinked'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'unlink', '--uid', user2, '--bucket', bucket_name],
+        check_status=True)
+
+    # relink the bucket to the first user and delete the second user
+    (err, out) = rgwadmin(ctx, client,
+        ['bucket', 'link', '--uid', user1, '--bucket', bucket_name, '--bucket-id', bucket_id],
+        check_status=True)
+
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2],
+        check_status=True)
+
+    #TESTCASE 'bucket link', 'bucket', 'tenanted user', 'succeeds'
+    tenant_name = "testx"
+    # create a tenanted user to link the bucket to
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--tenant', tenant_name,
+            '--uid', 'tenanteduser',
+            '--display-name', 'tenanted-user',
+            '--access-key', access_key2,
+            '--secret', secret_key2,
+            '--max-buckets', '1',
+            ],
+            check_status=True)
+
+    # link the bucket to a tenanted user
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--bucket', '/' + bucket_name, '--tenant', tenant_name, '--uid', 'tenanteduser'],
+        check_status=True)
+    
+    # check if the bucket name has tenant/ prefix
+    (err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n= tenant_name + '/' + bucket_name)],
+        check_status=True)
+
+    bucket_data = out['data']
+    assert bucket_data['bucket']['name'] == bucket_name
+    assert bucket_data['bucket']['tenant'] == tenant_name
+
+    # relink the bucket to the first user and delete the tenanted user
+    (err, out) = rgwadmin(ctx, client,
+        ['bucket', 'link', '--bucket', tenant_name + '/' + bucket_name, '--uid', user1],
+        check_status=True)
+
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--tenant', tenant_name, '--uid', 'tenanteduser'],
+        check_status=True)
+
+    # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
+
+    # upload an object
+    object_name = 'four'
+    key = boto.s3.key.Key(bucket, object_name)
+    key.set_contents_from_string(object_name)
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # fetch it too (for usage stats presently)
+    s = key.get_contents_as_string(encoding='ascii')
+    rl.log_and_clear("get_obj", bucket_name, user1)
+    assert s == object_name
+    # list bucket too (for usage stats presently)
+    keys = list(bucket.list())
+    rl.log_and_clear("list_bucket", bucket_name, user1)
+    assert len(keys) == 1
+    assert keys[0].name == object_name
+
+    # now delete it
+    (err, out) = rgwadmin(ctx, client,
+        ['object', 'rm', '--bucket', bucket_name, '--object', object_name],
+        check_status=True)
+
+    # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists one no objects'
+    (err, out) = rgwadmin(ctx, client, [
+            'bucket', 'stats', '--bucket', bucket_name],
+            check_status=True)
+    assert out['id'] == bucket_id
+    assert out['usage']['rgw.main']['num_objects'] == 0
+
+    # list log objects
+    # TESTCASE 'log-list','log','list','after activity','succeeds, lists one no objects'
+    (err, out) = rgwadmin(ctx, client, ['log', 'list'], check_status=True)
+    assert len(out) > 0
+
+    for obj in out:
+        # TESTCASE 'log-show','log','show','after activity','returns expected info'
+        if obj[:4] == 'meta' or obj[:4] == 'data' or obj[:18] == 'obj_delete_at_hint':
+            continue
+
+        (err, rgwlog) = rgwadmin(ctx, client, ['log', 'show', '--object', obj],
+            check_status=True)
+        assert len(rgwlog) > 0
+
+        # exempt bucket_name2 from checking as it was only used for multi-region tests
+        assert rgwlog['bucket'].find(bucket_name) == 0 or rgwlog['bucket'].find(bucket_name2) == 0
+        assert rgwlog['bucket'] != bucket_name or rgwlog['bucket_id'] == bucket_id
+        assert rgwlog['bucket_owner'] == user1 or rgwlog['bucket'] == bucket_name + '5' or rgwlog['bucket'] == bucket_name2
+        for entry in rgwlog['log_entries']:
+            log.debug('checking log entry: ', entry)
+            assert entry['bucket'] == rgwlog['bucket']
+            possible_buckets = [bucket_name + '5', bucket_name2]
+            user = entry['user']
+            assert user == user1 or user.endswith('system-user') or \
+                rgwlog['bucket'] in possible_buckets
+
+        # TESTCASE 'log-rm','log','rm','delete log objects','succeeds'
+        (err, out) = rgwadmin(ctx, client, ['log', 'rm', '--object', obj],
+            check_status=True)
+
+    # TODO: show log by bucket+date
+
+    # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
+        check_status=True)
+
+    # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
+    denied = False
+    try:
+        key = boto.s3.key.Key(bucket)
+        key.set_contents_from_string('five')
+    except boto.exception.S3ResponseError as e:
+        denied = True
+        assert e.status == 403
+
+    assert denied
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1],
+        check_status=True)
+
+    # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('six')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # TESTCASE 'gc-list', 'gc', 'list', 'get list of objects ready for garbage collection'
+
+    # create an object large enough to be split into multiple parts
+    test_string = 'foo'*10000000
+
+    big_key = boto.s3.key.Key(bucket)
+    big_key.set_contents_from_string(test_string)
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # now delete the head
+    big_key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
+
+    # wait a bit to give the garbage collector time to cycle
+    time.sleep(15)
+
+    (err, out) = rgwadmin(ctx, client, ['gc', 'list'])
+
+    assert len(out) > 0
+
+    # TESTCASE 'gc-process', 'gc', 'process', 'manually collect garbage'
+    (err, out) = rgwadmin(ctx, client, ['gc', 'process'], check_status=True)
+
+    #confirm
+    (err, out) = rgwadmin(ctx, client, ['gc', 'list'])
+
+    assert len(out) == 0
+
+    # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets'
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
+    assert err
+
+    # delete should fail because ``key`` still exists
+    try:
+        bucket.delete()
+    except boto.exception.S3ResponseError as e:
+        assert e.status == 409
+    rl.log_and_clear("delete_bucket", bucket_name, user1)
+
+    key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
+    bucket.delete()
+    rl.log_and_clear("delete_bucket", bucket_name, user1)
+
+    # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
+    bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+
+    # create an object
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('seven')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # should be private already but guarantee it
+    key.set_acl('private')
+    rl.log_and_clear("put_acls", bucket_name, user1)
+
+    (err, out) = rgwadmin(ctx, client,
+        ['policy', '--bucket', bucket.name, '--object', key.key.decode()],
+        check_status=True, format='xml')
+
+    acl = get_acl(key)
+    rl.log_and_clear("get_acls", bucket_name, user1)
+
+    assert acl == out.strip('\n')
+
+    # add another grantee by making the object public read
+    key.set_acl('public-read')
+    rl.log_and_clear("put_acls", bucket_name, user1)
+
+    (err, out) = rgwadmin(ctx, client,
+        ['policy', '--bucket', bucket.name, '--object', key.key.decode()],
+        check_status=True, format='xml')
+
+    acl = get_acl(key)
+    rl.log_and_clear("get_acls", bucket_name, user1)
+
+    assert acl == out.strip('\n')
+
+    # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
+    bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+    key_name = ['eight', 'nine', 'ten', 'eleven']
+    for i in range(4):
+        key = boto.s3.key.Key(bucket)
+        key.set_contents_from_string(key_name[i])
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    (err, out) = rgwadmin(ctx, client,
+        ['bucket', 'rm', '--bucket', bucket_name, '--purge-objects'],
+        check_status=True)
+
+    # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds'
+    caps='user=read'
+    (err, out) = rgwadmin(ctx, client, ['caps', 'add', '--uid', user1, '--caps', caps])
+
+    assert out['caps'][0]['perm'] == 'read'
+
+    # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['caps', 'rm', '--uid', user1, '--caps', caps])
+
+    assert not out['caps']
+
+    # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
+    bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+    key = boto.s3.key.Key(bucket)
+
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
+    assert err
+
+    # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds'
+    bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('twelve')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    time.sleep(35)
+
+    # need to wait for all usage data to get flushed, should take up to 30 seconds
+    timestamp = time.time()
+    while time.time() - timestamp <= (2 * 60):      # wait up to 20 minutes
+        (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj'])  # one of the operations we did is delete_obj, should be present.
+        if get_user_successful_ops(out, user1) > 0:
+            break
+        time.sleep(1)
+
+    assert time.time() - timestamp <= (20 * 60)
+
+    # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+
+    r = acc.compare_results(out)
+    if len(r) != 0:
+        sys.stderr.write(("\n".join(r))+"\n")
+        assert(len(r) == 0)
+
+    user_summary = get_user_summary(out, user1)
+
+    total = user_summary['total']
+    assert total['successful_ops'] > 0
+
+    # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+        check_status=True)
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+    user_summary = out['summary'][0]
+    for entry in user_summary['categories']:
+        assert entry['successful_ops'] > 0
+    assert user_summary['user'] == user1
+
+    # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
+    test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
+    for cat in test_categories:
+        (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat],
+            check_status=True)
+        assert len(out['summary']) > 0
+        user_summary = out['summary'][0]
+        assert user_summary['user'] == user1
+        assert len(user_summary['categories']) == 1
+        entry = user_summary['categories'][0]
+        assert entry['category'] == cat
+        assert entry['successful_ops'] > 0
+
+    # TESTCASE 'user-rename', 'user', 'rename', 'existing user', 'new user', 'succeeds'
+    # create a new user user3
+    (err, out) = rgwadmin(ctx, client, [
+        'user', 'create',
+        '--uid', user3,
+        '--display-name', display_name3,
+        '--access-key', access_key3,
+        '--secret', secret_key3,
+        '--max-buckets', '4'
+        ],
+        check_status=True)
+
+    # create a bucket
+    bucket = connection3.create_bucket(bucket_name + '6')
+
+    rl.log_and_clear("create_bucket", bucket_name + '6', user3)
+
+    # create object
+    object_name1 = 'thirteen'
+    key1 = boto.s3.key.Key(bucket, object_name1)
+    key1.set_contents_from_string(object_name1)
+    rl.log_and_clear("put_obj", bucket_name + '6', user3)
+
+    # rename user3
+    (err, out) = rgwadmin(ctx, client, ['user', 'rename', '--uid', user3, '--new-uid', user4], check_status=True)
+    assert out['user_id'] == user4
+    assert out['keys'][0]['access_key'] == access_key3
+    assert out['keys'][0]['secret_key'] == secret_key3
+
+    time.sleep(5)
+
+    # get bucket and object to test if user keys are preserved
+    bucket = connection3.get_bucket(bucket_name + '6')
+    s = key1.get_contents_as_string(encoding='ascii')
+    rl.log_and_clear("get_obj", bucket_name + '6', user4)
+    assert s == object_name1
+
+    # TESTCASE 'user-rename', 'user', 'rename', 'existing user', 'another existing user', 'fails'
+    # create a new user user2
+    (err, out) = rgwadmin(ctx, client, [
+        'user', 'create',
+        '--uid', user2,
+        '--display-name', display_name2,
+        '--access-key', access_key2,
+        '--secret', secret_key2,
+        '--max-buckets', '4'
+        ],
+        check_status=True)
+
+    # create a bucket
+    bucket = connection2.create_bucket(bucket_name + '7')
+
+    rl.log_and_clear("create_bucket", bucket_name + '7', user2)
+
+    # create object
+    object_name2 = 'fourteen'
+    key2 = boto.s3.key.Key(bucket, object_name2)
+    key2.set_contents_from_string(object_name2)
+    rl.log_and_clear("put_obj", bucket_name + '7', user2)
+
+    (err, out) = rgwadmin(ctx, client, ['user', 'rename', '--uid', user4, '--new-uid', user2])
+    assert err
+
+    # test if user 2 and user4 can still access their bucket and objects after rename fails
+    bucket = connection3.get_bucket(bucket_name + '6')
+    s = key1.get_contents_as_string(encoding='ascii')
+    rl.log_and_clear("get_obj", bucket_name + '6', user4)
+    assert s == object_name1
+
+    bucket = connection2.get_bucket(bucket_name + '7')
+    s = key2.get_contents_as_string(encoding='ascii')
+    rl.log_and_clear("get_obj", bucket_name + '7', user2)
+    assert s == object_name2
+
+    (err, out) = rgwadmin(ctx, client,
+    ['user', 'rm', '--uid', user4, '--purge-data' ],
+    check_status=True)
+
+    (err, out) = rgwadmin(ctx, client,
+    ['user', 'rm', '--uid', user2, '--purge-data' ],
+    check_status=True)
+
+    time.sleep(5)
+
+    # should be all through with connection. (anything using connection
+    #  should be BEFORE the usage stuff above.)
+    rl.log_and_clear("(before-close)", '-', '-', ignore_this_entry)
+    connection.close()
+    connection = None
+
+    # the usage flush interval is 30 seconds, wait that much an then some
+    # to make sure everything has been flushed
+    time.sleep(35)
+
+    # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1],
+        check_status=True)
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+        check_status=True)
+    assert len(out['entries']) == 0
+    assert len(out['summary']) == 0
+
+    (err, out) = rgwadmin(ctx, client,
+        ['user', 'rm', '--uid', user1, '--purge-data' ],
+        check_status=True)
+
+    # TESTCASE 'rm-user3','user','rm','deleted user','fails'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+    assert err
+
+    # TESTCASE 'zone-info', 'zone', 'get', 'get zone info', 'succeeds, has default placement rule'
+    #
+
+    (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
+    orig_placement_pools = len(out['placement_pools'])
+
+    # removed this test, it is not correct to assume that zone has default placement, it really
+    # depends on how we set it up before
+    #
+    # assert len(out) > 0
+    # assert len(out['placement_pools']) == 1
+
+    # default_rule = out['placement_pools'][0]
+    # assert default_rule['key'] == 'default-placement'
+
+    rule={'key': 'new-placement', 'val': {'data_pool': '.rgw.buckets.2', 'index_pool': '.rgw.buckets.index.2'}}
+
+    out['placement_pools'].append(rule)
+
+    (err, out) = rgwadmin(ctx, client, ['zone', 'set'],
+        stdin=BytesIO(json.dumps(out).encode()),
+        check_status=True)
+
+    (err, out) = rgwadmin(ctx, client, ['zone', 'get'])
+    assert len(out) > 0
+    assert len(out['placement_pools']) == orig_placement_pools + 1
+
+    zonecmd = ['zone', 'placement', 'rm',
+               '--rgw-zone', 'default',
+               '--placement-id', 'new-placement']
+
+    (err, out) = rgwadmin(ctx, client, zonecmd, check_status=True)
+
+    # TESTCASE 'zonegroup-info', 'zonegroup', 'get', 'get zonegroup info', 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['zonegroup', 'get'], check_status=True)
+
+from teuthology.config import config
+from teuthology.orchestra import cluster, remote
+import argparse;
+
+def main():
+    if len(sys.argv) == 3:
+        user = sys.argv[1] + "@"
+        host = sys.argv[2]
+    elif len(sys.argv) == 2:
+        user = ""
+        host = sys.argv[1]
+    else:
+        sys.stderr.write("usage: radosgw_admin.py [user] host\n")
+        exit(1)
+    client0 = remote.Remote(user + host)
+    ctx = config
+    ctx.cluster=cluster.Cluster(remotes=[(client0,
+        [ 'ceph.client.rgw.%s' % (host),  ]),])
+    ctx.rgw = argparse.Namespace()
+    endpoints = {}
+    endpoints['ceph.client.rgw.%s' % host] = (host, 80)
+    ctx.rgw.role_endpoints = endpoints
+    ctx.rgw.realm = None
+    ctx.rgw.regions = {'region0': { 'api name': 'api1',
+        'is master': True, 'master zone': 'r0z0',
+        'zones': ['r0z0', 'r0z1'] }}
+    ctx.rgw.config = {'ceph.client.rgw.%s' % host: {'system user': {'name': '%s-system-user' % host}}}
+    task(config, None)
+    exit()
+
+if __name__ == '__main__':
+    main()
diff --git a/qa/tasks/radosgw_admin_rest.py b/qa/tasks/radosgw_admin_rest.py
new file mode 100644
index 000000000..95fe5b8ac
--- /dev/null
+++ b/qa/tasks/radosgw_admin_rest.py
@@ -0,0 +1,721 @@
+"""
+Run a series of rgw admin commands through the rest interface.
+
+The test cases in this file have been annotated for inventory.
+To extract the inventory (in csv format) use the command:
+
+   grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
+
+"""
+import logging
+
+
+import boto.exception
+import boto.s3.connection
+import boto.s3.acl
+
+import requests
+import time
+
+from boto.connection import AWSAuthConnection
+from teuthology import misc as teuthology
+from tasks.util.rgw import get_user_summary, get_user_successful_ops, rgwadmin
+
+log = logging.getLogger(__name__)
+
+def rgwadmin_rest(connection, cmd, params=None, headers=None, raw=False):
+    """
+    perform a rest command
+    """
+    log.info('radosgw-admin-rest: %s %s' % (cmd, params))
+    put_cmds = ['create', 'link', 'add']
+    post_cmds = ['unlink', 'modify']
+    delete_cmds = ['trim', 'rm', 'process']
+    get_cmds = ['check', 'info', 'show', 'list']
+
+    bucket_sub_resources = ['object', 'policy', 'index']
+    user_sub_resources = ['subuser', 'key', 'caps']
+    zone_sub_resources = ['pool', 'log', 'garbage']
+
+    def get_cmd_method_and_handler(cmd):
+        """
+        Get the rest command and handler from information in cmd and
+        from the imported requests object.
+        """
+        if cmd[1] in put_cmds:
+            return 'PUT', requests.put
+        elif cmd[1] in delete_cmds:
+            return 'DELETE', requests.delete
+        elif cmd[1] in post_cmds:
+            return 'POST', requests.post
+        elif cmd[1] in get_cmds:
+            return 'GET', requests.get
+
+    def get_resource(cmd):
+        """
+        Get the name of the resource from information in cmd.
+        """
+        if cmd[0] == 'bucket' or cmd[0] in bucket_sub_resources:
+            if cmd[0] == 'bucket':
+                return 'bucket', ''
+            else:
+                return 'bucket', cmd[0]
+        elif cmd[0] == 'user' or cmd[0] in user_sub_resources:
+            if cmd[0] == 'user':
+                return 'user', ''
+            else:
+                return 'user', cmd[0]
+        elif cmd[0] == 'usage':
+            return 'usage', ''
+        elif cmd[0] == 'zone' or cmd[0] in zone_sub_resources:
+            if cmd[0] == 'zone':
+                return 'zone', ''
+            else:
+                return 'zone', cmd[0]
+
+    def build_admin_request(conn, method, resource = '', headers=None, data='',
+            query_args=None, params=None):
+        """
+        Build an administative request adapted from the build_request()
+        method of boto.connection
+        """
+
+        path = conn.calling_format.build_path_base('admin', resource)
+        auth_path = conn.calling_format.build_auth_path('admin', resource)
+        host = conn.calling_format.build_host(conn.server_name(), 'admin')
+        if query_args:
+            path += '?' + query_args
+            boto.log.debug('path=%s' % path)
+            auth_path += '?' + query_args
+            boto.log.debug('auth_path=%s' % auth_path)
+        return AWSAuthConnection.build_base_http_request(conn, method, path,
+                auth_path, params, headers, data, host)
+
+    method, handler = get_cmd_method_and_handler(cmd)
+    resource, query_args = get_resource(cmd)
+    request = build_admin_request(connection, method, resource,
+            query_args=query_args, headers=headers)
+
+    url = '{protocol}://{host}{path}'.format(protocol=request.protocol,
+            host=request.host, path=request.path)
+
+    request.authorize(connection=connection)
+    result = handler(url, params=params, headers=request.headers)
+
+    if raw:
+        log.info(' text result: %s' % result.text)
+        return result.status_code, result.text
+    elif len(result.content) == 0:
+        # many admin requests return no body, so json() throws a JSONDecodeError
+        log.info(' empty result')
+        return result.status_code, None
+    else:
+        log.info(' json result: %s' % result.json())
+        return result.status_code, result.json()
+
+
+def task(ctx, config):
+    """
+    Test radosgw-admin functionality through the RESTful interface
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task s3tests only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    # just use the first client...
+    client = next(iter(clients))
+
+    ##
+    admin_user = 'ada'
+    admin_display_name = 'Ms. Admin User'
+    admin_access_key = 'MH1WC2XQ1S8UISFDZC8W'
+    admin_secret_key = 'dQyrTPA0s248YeN5bBv4ukvKU0kh54LWWywkrpoG'
+    admin_caps = 'users=read, write; usage=read, write; buckets=read, write; zone=read, write'
+
+    user1 = 'foo'
+    user2 = 'fud'
+    subuser1 = 'foo:foo1'
+    subuser2 = 'foo:foo2'
+    display_name1 = 'Foo'
+    display_name2 = 'Fud'
+    email = 'foo@foo.com'
+    access_key = '9te6NH5mcdcq0Tc5i8i1'
+    secret_key = 'Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu'
+    access_key2 = 'p5YnriCv1nAtykxBrupQ'
+    secret_key2 = 'Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh'
+    swift_secret1 = 'gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL'
+    swift_secret2 = 'ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy'
+
+    bucket_name = 'myfoo'
+
+    # legend (test cases can be easily grep-ed out)
+    # TESTCASE 'testname','object','method','operation','assertion'
+    # TESTCASE 'create-admin-user','user','create','administrative user','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--uid', admin_user,
+            '--display-name', admin_display_name,
+            '--access-key', admin_access_key,
+            '--secret', admin_secret_key,
+            '--max-buckets', '0',
+            '--caps', admin_caps
+            ])
+    logging.error(out)
+    logging.error(err)
+    assert not err
+
+    assert hasattr(ctx, 'rgw'), 'radosgw-admin-rest must run after the rgw task'
+    endpoint = ctx.rgw.role_endpoints.get(client)
+    assert endpoint, 'no rgw endpoint for {}'.format(client)
+
+    admin_conn = boto.s3.connection.S3Connection(
+        aws_access_key_id=admin_access_key,
+        aws_secret_access_key=admin_secret_key,
+        is_secure=True if endpoint.cert else False,
+        port=endpoint.port,
+        host=endpoint.hostname,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+
+    # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {"uid": user1})
+    assert ret == 404
+
+    # TESTCASE 'create-ok','user','create','w/all valid info','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['user', 'create'],
+            {'uid' : user1,
+             'display-name' :  display_name1,
+             'email' : email,
+             'access-key' : access_key,
+             'secret-key' : secret_key,
+             'max-buckets' : '4'
+            })
+
+    assert ret == 200
+
+    # TESTCASE 'list-no-user','user','list','list user keys','user list object'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'list'], {'list' : '', 'max-entries' : 0})
+    assert ret == 200
+    assert out['count'] == 0
+    assert out['truncated'] == True
+    assert len(out['keys']) == 0
+    assert len(out['marker']) > 0
+
+    # TESTCASE 'list-user-without-marker','user','list','list user keys','user list object'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'list'], {'list' : '', 'max-entries' : 1})
+    assert ret == 200
+    assert out['count'] == 1
+    assert out['truncated'] == True
+    assert len(out['keys']) == 1
+    assert len(out['marker']) > 0
+    marker = out['marker']
+
+    # TESTCASE 'list-user-with-marker','user','list','list user keys','user list object'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'list'], {'list' : '', 'max-entries' : 1, 'marker': marker})
+    assert ret == 200
+    assert out['count'] == 1
+    assert out['truncated'] == False
+    assert len(out['keys']) == 1
+
+    # TESTCASE 'info-existing','user','info','existing user','returns correct info'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+
+    assert out['user_id'] == user1
+    assert out['email'] == email
+    assert out['display_name'] == display_name1
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+    assert not out['suspended']
+    assert out['tenant'] == ''
+    assert out['max_buckets'] == 4
+    assert out['caps'] == []
+    assert out['op_mask'] == 'read, write, delete'
+    assert out['default_placement'] == ''
+    assert out['default_storage_class'] == ''
+    assert out['placement_tags'] == []
+    assert not out['bucket_quota']['enabled']
+    assert not out['bucket_quota']['check_on_raw']
+    assert out['bucket_quota']['max_size'] == -1
+    assert out['bucket_quota']['max_size_kb'] == 0
+    assert out['bucket_quota']['max_objects'] == -1
+    assert not out['user_quota']['enabled']
+    assert not out['user_quota']['check_on_raw']
+    assert out['user_quota']['max_size'] == -1
+    assert out['user_quota']['max_size_kb'] == 0
+    assert out['user_quota']['max_objects'] == -1
+    assert out['temp_url_keys'] == []
+    assert out['type'] == 'rgw'
+    assert out['mfa_ids'] == []
+    # TESTCASE 'info-existing','user','info','existing user query with wrong uid but correct access key','returns correct info'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'access-key' : access_key, 'uid': 'uid_not_exist'})
+
+    assert out['user_id'] == user1
+    assert out['email'] == email
+    assert out['display_name'] == display_name1
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+    assert not out['suspended']
+    assert out['tenant'] == ''
+    assert out['max_buckets'] == 4
+    assert out['caps'] == []
+    assert out['op_mask'] == "read, write, delete"
+    assert out['default_placement'] == ''
+    assert out['default_storage_class'] == ''
+    assert out['placement_tags'] == []
+    assert not out['bucket_quota']['enabled']
+    assert not out['bucket_quota']['check_on_raw']
+    assert out ['bucket_quota']['max_size'] == -1
+    assert out ['bucket_quota']['max_size_kb'] == 0
+    assert out ['bucket_quota']['max_objects'] == -1
+    assert not out['user_quota']['enabled']
+    assert not out['user_quota']['check_on_raw']
+    assert out['user_quota']['max_size'] == -1
+    assert out['user_quota']['max_size_kb'] == 0
+    assert out['user_quota']['max_objects'] == -1
+    assert out['temp_url_keys'] == []
+    assert out['type'] == 'rgw'
+    assert out['mfa_ids'] == []
+
+    # TESTCASE 'suspend-ok','user','suspend','active user','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True})
+    assert ret == 200
+
+    # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert out['suspended']
+    assert out['email'] == email
+
+    # TESTCASE 're-enable','user','enable','suspended user','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'})
+    assert not err
+
+    # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert not out['suspended']
+
+    # TESTCASE 'add-keys','key','create','w/valid info','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['key', 'create'],
+            {'uid' : user1,
+             'access-key' : access_key2,
+             'secret-key' : secret_key2
+            })
+
+
+    assert ret == 200
+
+    # TESTCASE 'info-new-key','user','info','after key addition','returns all keys'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert len(out['keys']) == 2
+    assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2
+    assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2
+
+    # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['key', 'rm'],
+            {'uid' : user1,
+             'access-key' : access_key2
+            })
+
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+
+    # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['subuser', 'create'],
+            {'subuser' : subuser1,
+             'secret-key' : swift_secret1,
+             'key-type' : 'swift'
+            })
+
+    assert ret == 200
+
+    # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert len(out['swift_keys']) == 1
+    assert out['swift_keys'][0]['user'] == subuser1
+    assert out['swift_keys'][0]['secret_key'] == swift_secret1
+
+    # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['subuser', 'create'],
+            {'subuser' : subuser2,
+             'secret-key' : swift_secret2,
+             'key-type' : 'swift'
+            })
+
+    assert ret == 200
+
+    # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert ret == 200
+    assert len(out['swift_keys']) == 2
+    assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2
+    assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2
+
+    # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['key', 'rm'],
+            {'subuser' : subuser1,
+             'key-type' :'swift'
+            })
+
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert len(out['swift_keys']) == 1
+
+    # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['subuser', 'rm'],
+            {'subuser' : subuser1
+            })
+
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert len(out['subusers']) == 1
+
+    # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subser and key is removed'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['subuser', 'rm'],
+            {'subuser' : subuser2,
+             'key-type' : 'swift',
+             '{purge-keys' :True
+            })
+
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert len(out['swift_keys']) == 0
+    assert len(out['subusers']) == 0
+
+    # TESTCASE 'bucket-stats','bucket','info','no session/buckets','succeeds, empty list'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' :  user1})
+    assert ret == 200
+    assert len(out) == 0
+
+    # connect to rgw
+    connection = boto.s3.connection.S3Connection(
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        is_secure=True if endpoint.cert else False,
+        port=endpoint.port,
+        host=endpoint.hostname,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+
+    # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True})
+    assert ret == 200
+    assert len(out) == 0
+
+    # create a first bucket
+    bucket = connection.create_bucket(bucket_name)
+
+    # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert len(out) == 1
+    assert out[0] == bucket_name
+
+    # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+
+    assert ret == 200
+    assert out['owner'] == user1
+    assert out['tenant'] == ''
+    bucket_id = out['id']
+
+    # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True})
+    assert ret == 200
+    assert len(out) == 1
+    assert out[0]['id'] == bucket_id    # does it return the same ID twice in a row?
+
+    # use some space
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('one')
+
+    # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+    assert ret == 200
+    assert out['id'] == bucket_id
+    assert out['usage']['rgw.main']['num_objects'] == 1
+    assert out['usage']['rgw.main']['size_kb'] > 0
+
+    # TESTCASE 'bucket-stats6', 'bucket', 'stats', 'non-existent bucket', 'fails, 'bucket not found error'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : 'doesnotexist'})
+    assert ret == 404
+    assert out['Code'] == 'NoSuchBucket'
+
+    # reclaim it
+    key.delete()
+
+    # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'unlink'], {'uid' : user1, 'bucket' : bucket_name})
+
+    assert ret == 200
+
+    # create a second user to link the bucket to
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['user', 'create'],
+            {'uid' : user2,
+            'display-name' :  display_name2,
+            'access-key' : access_key2,
+            'secret-key' : secret_key2,
+            'max-buckets' : '1',
+            })
+
+    assert ret == 200
+
+    # try creating an object with the first user before the bucket is relinked
+    denied = False
+    key = boto.s3.key.Key(bucket)
+
+    try:
+        key.set_contents_from_string('two')
+    except boto.exception.S3ResponseError:
+        denied = True
+
+    assert not denied
+
+    # delete the object
+    key.delete()
+
+    # link the bucket to another user
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['bucket', 'link'],
+            {'uid' : user2,
+             'bucket' : bucket_name,
+             'bucket-id' : bucket_id,
+            })
+
+    assert ret == 200
+
+    # try creating an object with the first user which should cause an error
+    key = boto.s3.key.Key(bucket)
+
+    try:
+        key.set_contents_from_string('three')
+    except boto.exception.S3ResponseError:
+        denied = True
+
+    assert denied
+
+    # relink the bucket to the first user and delete the second user
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['bucket', 'link'],
+            {'uid' : user1,
+             'bucket' : bucket_name,
+             'bucket-id' : bucket_id,
+            })
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user2})
+    assert ret == 200
+
+    # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
+
+    # upload an object
+    object_name = 'four'
+    key = boto.s3.key.Key(bucket, object_name)
+    key.set_contents_from_string(object_name)
+
+    # now delete it
+    (ret, out) = rgwadmin_rest(admin_conn, ['object', 'rm'], {'bucket' : bucket_name, 'object' : object_name})
+    assert ret == 200
+
+    # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists one no objects'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+    assert ret == 200
+    assert out['id'] == bucket_id
+    assert out['usage']['rgw.main']['num_objects'] == 0
+
+    # create a bucket for deletion stats
+    useless_bucket = connection.create_bucket('useless-bucket')
+    useless_key = useless_bucket.new_key('useless_key')
+    useless_key.set_contents_from_string('useless string')
+
+    # delete it
+    useless_key.delete()
+    useless_bucket.delete()
+
+    # wait for the statistics to flush
+    time.sleep(60)
+
+    # need to wait for all usage data to get flushed, should take up to 30 seconds
+    timestamp = time.time()
+    while time.time() - timestamp <= (20 * 60):      # wait up to 20 minutes
+        (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'categories' : 'delete_obj'})  # last operation we did is delete obj, wait for it to flush
+
+        if get_user_successful_ops(out, user1) > 0:
+            break
+        time.sleep(1)
+
+    assert time.time() - timestamp <= (20 * 60)
+
+    # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'])
+    assert ret == 200
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+    user_summary = get_user_summary(out, user1)
+    total = user_summary['total']
+    assert total['successful_ops'] > 0
+
+    # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1})
+    assert ret == 200
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+    user_summary = out['summary'][0]
+    for entry in user_summary['categories']:
+        assert entry['successful_ops'] > 0
+    assert user_summary['user'] == user1
+
+    # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
+    test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
+    for cat in test_categories:
+        (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1, 'categories' : cat})
+        assert ret == 200
+        assert len(out['summary']) > 0
+        user_summary = out['summary'][0]
+        assert user_summary['user'] == user1
+        assert len(user_summary['categories']) == 1
+        entry = user_summary['categories'][0]
+        assert entry['category'] == cat
+        assert entry['successful_ops'] > 0
+
+    # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
+    (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'trim'], {'uid' : user1})
+    assert ret == 200
+    (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1})
+    assert ret == 200
+    assert len(out['entries']) == 0
+    assert len(out['summary']) == 0
+
+    # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True})
+    assert ret == 200
+
+    # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
+    try:
+        key = boto.s3.key.Key(bucket)
+        key.set_contents_from_string('five')
+    except boto.exception.S3ResponseError as e:
+        assert e.status == 403
+
+    # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' :  user1, 'suspended' : 'false'})
+    assert ret == 200
+
+    # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('six')
+
+    # TESTCASE 'garbage-list', 'garbage', 'list', 'get list of objects ready for garbage collection'
+
+    # create an object large enough to be split into multiple parts
+    test_string = 'foo'*10000000
+
+    big_key = boto.s3.key.Key(bucket)
+    big_key.set_contents_from_string(test_string)
+
+    # now delete the head
+    big_key.delete()
+
+    # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1})
+    assert ret == 409
+
+    # delete should fail because ``key`` still exists
+    try:
+        bucket.delete()
+    except boto.exception.S3ResponseError as e:
+        assert e.status == 409
+
+    key.delete()
+    bucket.delete()
+
+    # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
+    bucket = connection.create_bucket(bucket_name)
+
+    # create an object
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('seven')
+
+    # should be private already but guarantee it
+    key.set_acl('private')
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key})
+    assert ret == 200
+    assert len(out['acl']['grant_map']) == 1
+
+    # add another grantee by making the object public read
+    key.set_acl('public-read')
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key})
+    assert ret == 200
+    assert len(out['acl']['grant_map']) == 2
+
+    # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
+    bucket = connection.create_bucket(bucket_name)
+    key_name = ['eight', 'nine', 'ten', 'eleven']
+    for i in range(4):
+        key = boto.s3.key.Key(bucket)
+        key.set_contents_from_string(key_name[i])
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'rm'], {'bucket' : bucket_name, 'purge-objects' : True})
+    assert ret == 200
+
+    # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds'
+    caps = 'usage=read'
+    (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'add'], {'uid' :  user1, 'user-caps' : caps})
+    assert ret == 200
+    assert out[0]['perm'] == 'read'
+
+    # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'rm'], {'uid' :  user1, 'user-caps' : caps})
+    assert ret == 200
+    assert not out
+
+    # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
+    bucket = connection.create_bucket(bucket_name)
+    key = boto.s3.key.Key(bucket)
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1})
+    assert ret == 409
+
+    # TESTCASE 'rm-user2', 'user', 'rm', user with data', 'succeeds'
+    bucket = connection.create_bucket(bucket_name)
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('twelve')
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1, 'purge-data' : True})
+    assert ret == 200
+
+    # TESTCASE 'rm-user3','user','info','deleted user','fails'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert ret == 404
+
diff --git a/qa/tasks/ragweed.py b/qa/tasks/ragweed.py
new file mode 100644
index 000000000..eae1806c1
--- /dev/null
+++ b/qa/tasks/ragweed.py
@@ -0,0 +1,376 @@
+"""
+Run a set of s3 tests on rgw.
+"""
+from io import BytesIO
+from configobj import ConfigObj
+import base64
+import contextlib
+import logging
+import os
+import random
+import string
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+def get_ragweed_branches(config, client_conf):
+    """
+    figure out the ragweed branch according to the per-client settings
+
+    use force-branch is specified, and fall back to the ones deduced using ceph
+    branch under testing
+    """
+    force_branch = client_conf.get('force-branch', None)
+    if force_branch:
+        return [force_branch]
+    else:
+        S3_BRANCHES = ['master', 'nautilus', 'mimic',
+                       'luminous', 'kraken', 'jewel']
+        ceph_branch = config.get('branch')
+        suite_branch = config.get('suite_branch', ceph_branch)
+        if suite_branch in S3_BRANCHES:
+            branch = client_conf.get('branch', 'ceph-' + suite_branch)
+        else:
+            branch = client_conf.get('branch', suite_branch)
+        default_branch = client_conf.get('default-branch', None)
+        if default_branch:
+            return [branch, default_branch]
+        else:
+            return [branch]
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download the s3 tests from the git builder.
+    Remove downloaded s3 file upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading ragweed...')
+    testdir = teuthology.get_testdir(ctx)
+    for (client, cconf) in config.items():
+        ragweed_repo = ctx.config.get('ragweed_repo',
+                                      teuth_config.ceph_git_base_url + 'ragweed.git')
+        for branch in get_ragweed_branches(ctx.config, cconf):
+            log.info("Using branch '%s' for ragweed", branch)
+            try:
+                ctx.cluster.only(client).sh(
+                    script=f'git clone -b {branch} {ragweed_repo} {testdir}/ragweed')
+                break
+            except Exception as e:
+                exc = e
+        else:
+            raise exc
+
+        sha1 = cconf.get('sha1')
+        if sha1 is not None:
+            ctx.cluster.only(client).run(
+                args=[
+                    'cd', '{tdir}/ragweed'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'git', 'reset', '--hard', sha1,
+                    ],
+                )
+    try:
+        yield
+    finally:
+        log.info('Removing ragweed...')
+        testdir = teuthology.get_testdir(ctx)
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[
+                    'rm',
+                    '-rf',
+                    '{tdir}/ragweed'.format(tdir=testdir),
+                    ],
+                )
+
+
+def _config_user(ragweed_conf, section, user):
+    """
+    Configure users for this section by stashing away keys, ids, and
+    email addresses.
+    """
+    ragweed_conf[section].setdefault('user_id', user)
+    ragweed_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
+    ragweed_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
+    ragweed_conf[section].setdefault('access_key', ''.join(random.choice(string.ascii_uppercase) for i in range(20)))
+    ragweed_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40)).decode('ascii'))
+
+
+@contextlib.contextmanager
+def create_users(ctx, config, run_stages):
+    """
+    Create a main and an alternate s3 user.
+    """
+    assert isinstance(config, dict)
+
+    for client, properties in config['config'].items():
+        run_stages[client] = properties.get('stages', 'prepare,check').split(',')
+
+    log.info('Creating rgw users...')
+    testdir = teuthology.get_testdir(ctx)
+    users = {'user regular': 'ragweed', 'user system': 'sysuser'}
+    for client in config['clients']:
+        if not 'prepare' in run_stages[client]:
+            # should have been prepared in a previous run
+            continue
+
+        ragweed_conf = config['ragweed_conf'][client]
+        ragweed_conf.setdefault('fixtures', {})
+        ragweed_conf['rgw'].setdefault('bucket_prefix', 'test-' + client)
+        for section, user in users.items():
+            _config_user(ragweed_conf, section, '{user}.{client}'.format(user=user, client=client))
+            log.debug('Creating user {user} on {host}'.format(user=ragweed_conf[section]['user_id'], host=client))
+            if user == 'sysuser':
+                sys_str = 'true'
+            else:
+                sys_str = 'false'
+            ctx.cluster.only(client).run(
+                args=[
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'radosgw-admin',
+                    '-n', client,
+                    'user', 'create',
+                    '--uid', ragweed_conf[section]['user_id'],
+                    '--display-name', ragweed_conf[section]['display_name'],
+                    '--access-key', ragweed_conf[section]['access_key'],
+                    '--secret', ragweed_conf[section]['secret_key'],
+                    '--email', ragweed_conf[section]['email'],
+                    '--system', sys_str,
+                ],
+            )
+    try:
+        yield
+    finally:
+        for client in config['clients']:
+            if not 'check' in run_stages[client]:
+                # only remove user if went through the check stage
+                continue
+            for user in users.values():
+                uid = '{user}.{client}'.format(user=user, client=client)
+                ctx.cluster.only(client).run(
+                    args=[
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        '{tdir}/archive/coverage'.format(tdir=testdir),
+                        'radosgw-admin',
+                        '-n', client,
+                        'user', 'rm',
+                        '--uid', uid,
+                        '--purge-data',
+                        ],
+                    )
+
+
+@contextlib.contextmanager
+def configure(ctx, config, run_stages):
+    """
+    Configure the ragweed.  This includes the running of the
+    bootstrap code and the updating of local conf files.
+    """
+    assert isinstance(config, dict)
+    log.info('Configuring ragweed...')
+    testdir = teuthology.get_testdir(ctx)
+    for client, properties in config['clients'].items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        remote.run(
+            args=[
+                'cd',
+                '{tdir}/ragweed'.format(tdir=testdir),
+                run.Raw('&&'),
+                './bootstrap',
+                ],
+            )
+
+        preparing = 'prepare' in run_stages[client]
+        if not preparing:
+            # should have been prepared in a previous run
+            continue
+
+        ragweed_conf = config['ragweed_conf'][client]
+        if properties is not None and 'slow_backend' in properties:
+            ragweed_conf['fixtures']['slow backend'] = properties['slow_backend']
+
+        conf_fp = BytesIO()
+        ragweed_conf.write(conf_fp)
+        remote.write_file(
+            path='{tdir}/archive/ragweed.{client}.conf'.format(tdir=testdir, client=client),
+            data=conf_fp.getvalue(),
+            )
+
+    log.info('Configuring boto...')
+    boto_src = os.path.join(os.path.dirname(__file__), 'boto.cfg.template')
+    for client, properties in config['clients'].items():
+        with open(boto_src, 'r') as f:
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            conf = f.read().format(
+                idle_timeout=config.get('idle_timeout', 30)
+                )
+            remote.write_file('{tdir}/boto.cfg'.format(tdir=testdir), conf)
+
+    try:
+        yield
+
+    finally:
+        log.info('Cleaning up boto...')
+        for client, properties in config['clients'].items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(
+                args=[
+                    'rm',
+                    '{tdir}/boto.cfg'.format(tdir=testdir),
+                    ],
+                )
+
+@contextlib.contextmanager
+def run_tests(ctx, config, run_stages):
+    """
+    Run the ragweed after everything is set up.
+
+    :param ctx: Context passed to task
+    :param config: specific configuration information
+    """
+    assert isinstance(config, dict)
+    testdir = teuthology.get_testdir(ctx)
+    attrs = ["!fails_on_rgw"]
+    for client, client_config in config.items():
+        stages = ','.join(run_stages[client])
+        args = [
+            'RAGWEED_CONF={tdir}/archive/ragweed.{client}.conf'.format(tdir=testdir, client=client),
+            'RAGWEED_STAGES={stages}'.format(stages=stages),
+            'BOTO_CONFIG={tdir}/boto.cfg'.format(tdir=testdir),
+            '{tdir}/ragweed/virtualenv/bin/python'.format(tdir=testdir),
+            '-m', 'nose',
+            '-w',
+            '{tdir}/ragweed'.format(tdir=testdir),
+            '-v',
+            '-a', ','.join(attrs),
+            ]
+        if client_config is not None and 'extra_args' in client_config:
+            args.extend(client_config['extra_args'])
+
+        ctx.cluster.only(client).run(
+            args=args,
+            label="ragweed tests against rgw"
+            )
+    yield
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the ragweed suite against rgw.
+
+    To run all tests on all clients::
+
+        tasks:
+        - ceph:
+        - rgw:
+        - ragweed:
+
+    To restrict testing to particular clients::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - ragweed: [client.0]
+
+    To run against a server on client.1 and increase the boto timeout to 10m::
+
+        tasks:
+        - ceph:
+        - rgw: [client.1]
+        - ragweed:
+            client.0:
+              rgw_server: client.1
+              idle_timeout: 600
+              stages: prepare,check
+
+    To pass extra arguments to nose (e.g. to run a certain test)::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - ragweed:
+            client.0:
+              extra_args: ['test_s3:test_object_acl_grand_public_read']
+            client.1:
+              extra_args: ['--exclude', 'test_100_continue']
+    """
+    assert hasattr(ctx, 'rgw'), 'ragweed must run after the rgw task'
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task ragweed only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level.
+    for client in config.keys():
+        if not config[client]:
+            config[client] = {}
+        teuthology.deep_merge(config[client], overrides.get('ragweed', {}))
+
+    log.debug('ragweed config is %s', config)
+
+    ragweed_conf = {}
+    for client in clients:
+        # use rgw_server endpoint if given, or default to same client
+        target = config[client].get('rgw_server', client)
+
+        endpoint = ctx.rgw.role_endpoints.get(target)
+        assert endpoint, 'ragweed: no rgw endpoint for {}'.format(target)
+
+        ragweed_conf[client] = ConfigObj(
+            indent_type='',
+            infile={
+                'rgw':
+                    {
+                    'host'      : endpoint.dns_name,
+                    'port'      : endpoint.port,
+                    'is_secure' : endpoint.cert is not None,
+                    },
+                'fixtures' : {},
+                'user system'  : {},
+                'user regular'   : {},
+                'rados':
+                    {
+                    'ceph_conf'  : '/etc/ceph/ceph.conf',
+                    },
+                }
+            )
+
+    run_stages = {}
+
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: create_users(ctx=ctx, config=dict(
+                clients=clients,
+                ragweed_conf=ragweed_conf,
+                config=config,
+                ),
+                run_stages=run_stages),
+        lambda: configure(ctx=ctx, config=dict(
+                clients=config,
+                ragweed_conf=ragweed_conf,
+                ),
+                run_stages=run_stages),
+        lambda: run_tests(ctx=ctx, config=config, run_stages=run_stages),
+        ):
+        pass
+    yield
diff --git a/qa/tasks/rbd.py b/qa/tasks/rbd.py
new file mode 100644
index 000000000..6b9786f22
--- /dev/null
+++ b/qa/tasks/rbd.py
@@ -0,0 +1,686 @@
+"""
+Rbd testing task
+"""
+import contextlib
+import logging
+import os
+import tempfile
+import sys
+
+from io import StringIO
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.parallel import parallel
+from teuthology.task.common_fs_utils import generic_mkfs
+from teuthology.task.common_fs_utils import generic_mount
+from teuthology.task.common_fs_utils import default_image_name
+
+
+#V1 image unsupported but required for testing purposes
+os.environ["RBD_FORCE_ALLOW_V1"] = "1"
+
+log = logging.getLogger(__name__)
+
+ENCRYPTION_PASSPHRASE = "password"
+
+@contextlib.contextmanager
+def create_image(ctx, config):
+    """
+    Create an rbd image.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.create_image:
+            client.0:
+                image_name: testimage
+                image_size: 100
+                image_format: 1
+                encryption_format: luks2
+            client.1:
+
+    Image size is expressed as a number of megabytes; default value
+    is 10240.
+
+    Image format value must be either 1 or 2; default value is 1.
+
+    """
+    assert isinstance(config, dict) or isinstance(config, list), \
+        "task create_image only supports a list or dictionary for configuration"
+
+    if isinstance(config, dict):
+        images = config.items()
+    else:
+        images = [(role, None) for role in config]
+
+    testdir = teuthology.get_testdir(ctx)
+    passphrase_file = '{tdir}/passphrase'.format(tdir=testdir)
+    for role, properties in images:
+        if properties is None:
+            properties = {}
+        name = properties.get('image_name', default_image_name(role))
+        size = properties.get('image_size', 10240)
+        fmt = properties.get('image_format', 1)
+        encryption_format = properties.get('encryption_format', 'none')
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        log.info('Creating image {name} with size {size}'.format(name=name,
+                                                                 size=size))
+        args = [
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=testdir),
+                'rbd',
+                '-p', 'rbd',
+                'create',
+                '--size', str(size),
+                name,
+            ]
+        # omit format option if using the default (format 1)
+        # since old versions of don't support it
+        if int(fmt) != 1:
+            args += ['--image-format', str(fmt)]
+        remote.run(args=args)
+
+        if encryption_format != 'none':
+            remote.run(
+                args=[
+                    'echo',
+                    ENCRYPTION_PASSPHRASE,
+                    run.Raw('>'),
+                    passphrase_file
+                    ]
+                )
+            remote.run(
+                args=[
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'rbd',
+                    'encryption',
+                    'format',
+                    name,
+                    encryption_format,
+                    passphrase_file,
+                    '-p',
+                    'rbd'
+                    ]
+                )
+    try:
+        yield
+    finally:
+        log.info('Deleting rbd images...')
+        remote.run(args=['rm', '-f', passphrase_file])
+        for role, properties in images:
+            if properties is None:
+                properties = {}
+            name = properties.get('image_name', default_image_name(role))
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            remote.run(
+                args=[
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'rbd',
+                    '-p', 'rbd',
+                    'rm',
+                    name,
+                    ],
+                )
+
+@contextlib.contextmanager
+def clone_image(ctx, config):
+    """
+    Clones a parent imag
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.clone_image:
+            client.0:
+                parent_name: testimage
+                image_name: cloneimage
+    """
+    assert isinstance(config, dict) or isinstance(config, list), \
+        "task clone_image only supports a list or dictionary for configuration"
+
+    if isinstance(config, dict):
+        images = config.items()
+    else:
+        images = [(role, None) for role in config]
+
+    testdir = teuthology.get_testdir(ctx)
+    for role, properties in images:
+        if properties is None:
+            properties = {}
+
+        name = properties.get('image_name', default_image_name(role))
+        parent_name = properties.get('parent_name')
+        assert parent_name is not None, \
+            "parent_name is required"
+        parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
+
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        log.info('Clone image {parent} to {child}'.format(parent=parent_name,
+                                                          child=name))
+        for cmd in [('snap', 'create', parent_spec),
+                    ('snap', 'protect', parent_spec),
+                    ('clone', parent_spec, name)]:
+            args = [
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'rbd', '-p', 'rbd'
+                    ]
+            args.extend(cmd)
+            remote.run(args=args)
+
+    try:
+        yield
+    finally:
+        log.info('Deleting rbd clones...')
+        for role, properties in images:
+            if properties is None:
+                properties = {}
+            name = properties.get('image_name', default_image_name(role))
+            parent_name = properties.get('parent_name')
+            parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
+
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+
+            for cmd in [('rm', name),
+                        ('snap', 'unprotect', parent_spec),
+                        ('snap', 'rm', parent_spec)]:
+                args = [
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        '{tdir}/archive/coverage'.format(tdir=testdir),
+                        'rbd', '-p', 'rbd'
+                        ]
+                args.extend(cmd)
+                remote.run(args=args)
+
+@contextlib.contextmanager
+def modprobe(ctx, config):
+    """
+    Load the rbd kernel module..
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.create_image: [client.0]
+        - rbd.modprobe: [client.0]
+    """
+    log.info('Loading rbd kernel module...')
+    for role in config:
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        remote.run(
+            args=[
+                'sudo',
+                'modprobe',
+                'rbd',
+                ],
+            )
+    try:
+        yield
+    finally:
+        log.info('Unloading rbd kernel module...')
+        for role in config:
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            remote.run(
+                args=[
+                    'sudo',
+                    'modprobe',
+                    '-r',
+                    'rbd',
+                    # force errors to be ignored; necessary if more
+                    # than one device was created, which may mean
+                    # the module isn't quite ready to go the first
+                    # time through.
+                    run.Raw('||'),
+                    'true',
+                    ],
+                )
+
+@contextlib.contextmanager
+def dev_create(ctx, config):
+    """
+    Map block devices to rbd images.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.create_image: [client.0]
+        - rbd.modprobe: [client.0]
+        - rbd.dev_create:
+            client.0:
+                image_name: testimage.client.0
+                encryption_format: luks2
+    """
+    assert isinstance(config, dict) or isinstance(config, list), \
+        "task dev_create only supports a list or dictionary for configuration"
+
+    if isinstance(config, dict):
+        images = config.items()
+    else:
+        images = [(role, None) for role in config]
+
+    log.info('Creating rbd block devices...')
+
+    testdir = teuthology.get_testdir(ctx)
+    passphrase_file = '{tdir}/passphrase'.format(tdir=testdir)
+    device_path = {}
+
+    for role, properties in images:
+        if properties is None:
+            properties = {}
+        name = properties.get('image_name', default_image_name(role))
+        encryption_format = properties.get('encryption_format', 'none')
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+
+        if encryption_format == 'none':
+            device_path[role] = '/dev/rbd/rbd/{image}'.format(image=name)
+            device_specific_args = []
+        else:
+            remote.run(
+                args=[
+                    'echo',
+                    ENCRYPTION_PASSPHRASE,
+                    run.Raw('>'),
+                    passphrase_file
+                    ]
+                )
+            device_specific_args = [
+                '-t', 'nbd', '-o',
+                'encryption-format=%s,encryption-passphrase-file=%s' % (
+                    encryption_format, passphrase_file)]
+
+        map_fp = StringIO()
+        remote.run(
+            args=[
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=testdir),
+                'rbd',
+                '--id', role.rsplit('.')[-1],
+                '-p', 'rbd',
+                'map',
+                name] + device_specific_args,
+            stdout=map_fp,
+            )
+
+        if encryption_format != 'none':
+            device_path[role] = map_fp.getvalue().rstrip()
+            properties['device_path'] = device_path[role]
+            remote.run(args=['sudo', 'chmod', '666', device_path[role]])
+    try:
+        yield
+    finally:
+        log.info('Unmapping rbd devices...')
+        remote.run(args=['rm', '-f', passphrase_file])
+        for role, properties in images:
+            if not device_path.get(role):
+                continue
+
+            if properties is None:
+                properties = {}
+            encryption_format = properties.get('encryption_format', 'none')
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+
+            if encryption_format == 'none':
+                device_specific_args = []
+            else:
+                device_specific_args = ['-t', 'nbd']
+
+            remote.run(
+                args=[
+                    'LD_LIBRARY_PATH={tdir}/binary/usr/local/lib'.format(tdir=testdir),
+                    'sudo',
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'rbd',
+                    '-p', 'rbd',
+                    'unmap',
+                    device_path[role],
+                    ] + device_specific_args,
+                )
+
+
+def rbd_devname_rtn(ctx, image):
+    return '/dev/rbd/rbd/{image}'.format(image=image)    
+
+def canonical_path(ctx, role, path):
+    """
+    Determine the canonical path for a given path on the host
+    representing the given role.  A canonical path contains no
+    . or .. components, and includes no symbolic links.
+    """
+    version_fp = StringIO()
+    ctx.cluster.only(role).run(
+        args=[ 'readlink', '-f', path ],
+        stdout=version_fp,
+        )
+    canonical_path = version_fp.getvalue().rstrip('\n')
+    version_fp.close()
+    return canonical_path
+
+@contextlib.contextmanager
+def run_xfstests(ctx, config):
+    """
+    Run xfstests over specified devices.
+
+    Warning: both the test and scratch devices specified will be
+    overwritten.  Normally xfstests modifies (but does not destroy)
+    the test device, but for now the run script used here re-makes
+    both filesystems.
+
+    Note: Only one instance of xfstests can run on a single host at
+    a time, although this is not enforced.
+
+    This task in its current form needs some improvement.  For
+    example, it assumes all roles provided in the config are
+    clients, and that the config provided is a list of key/value
+    pairs.  For now please use the xfstests() interface, below.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.run_xfstests:
+            client.0:
+                count: 2
+                test_dev: 'test_dev'
+                scratch_dev: 'scratch_dev'
+                fs_type: 'xfs'
+                tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+                exclude:
+                - generic/42
+                randomize: true
+    """
+    with parallel() as p:
+        for role, properties in config.items():
+            p.spawn(run_xfstests_one_client, ctx, role, properties)
+        exc = None
+        while True:
+            try:
+                p.next()
+            except StopIteration:
+                break
+            except:
+                exc = sys.exc_info()[1]
+        if exc is not None:
+            raise exc
+    yield
+
+def run_xfstests_one_client(ctx, role, properties):
+    """
+    Spawned routine to handle xfs tests for a single client
+    """
+    testdir = teuthology.get_testdir(ctx)
+    try:
+        count = properties.get('count')
+        test_dev = properties.get('test_dev')
+        assert test_dev is not None, \
+            "task run_xfstests requires test_dev to be defined"
+        test_dev = canonical_path(ctx, role, test_dev)
+
+        scratch_dev = properties.get('scratch_dev')
+        assert scratch_dev is not None, \
+            "task run_xfstests requires scratch_dev to be defined"
+        scratch_dev = canonical_path(ctx, role, scratch_dev)
+
+        fs_type = properties.get('fs_type')
+        tests = properties.get('tests')
+        exclude_list = properties.get('exclude')
+        randomize = properties.get('randomize')
+
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+
+        # Fetch the test script
+        test_root = teuthology.get_testdir(ctx)
+        test_script = 'run_xfstests.sh'
+        test_path = os.path.join(test_root, test_script)
+
+        xfstests_url = properties.get('xfstests_url')
+        assert xfstests_url is not None, \
+            "task run_xfstests requires xfstests_url to be defined"
+
+        xfstests_krbd_url = xfstests_url + '/' + test_script
+
+        log.info('Fetching {script} for {role} from {url}'.format(
+            script=test_script,
+            role=role,
+            url=xfstests_krbd_url))
+
+        args = [ 'wget', '-O', test_path, '--', xfstests_krbd_url ]
+        remote.run(args=args)
+
+        log.info('Running xfstests on {role}:'.format(role=role))
+        log.info('   iteration count: {count}:'.format(count=count))
+        log.info('       test device: {dev}'.format(dev=test_dev))
+        log.info('    scratch device: {dev}'.format(dev=scratch_dev))
+        log.info('     using fs_type: {fs_type}'.format(fs_type=fs_type))
+        log.info('      tests to run: {tests}'.format(tests=tests))
+        log.info('      exclude list: {}'.format(' '.join(exclude_list)))
+        log.info('         randomize: {randomize}'.format(randomize=randomize))
+
+        if exclude_list:
+            with tempfile.NamedTemporaryFile(mode='w', prefix='exclude') as exclude_file:
+                for test in exclude_list:
+                    exclude_file.write("{}\n".format(test))
+                exclude_file.flush()
+                remote.put_file(exclude_file.name, exclude_file.name)
+
+        # Note that the device paths are interpreted using
+        # readlink -f <path> in order to get their canonical
+        # pathname (so it matches what the kernel remembers).
+        args = [
+            '/usr/bin/sudo',
+            'TESTDIR={tdir}'.format(tdir=testdir),
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            '/bin/bash',
+            test_path,
+            '-c', str(count),
+            '-f', fs_type,
+            '-t', test_dev,
+            '-s', scratch_dev,
+            ]
+        if exclude_list:
+            args.extend(['-x', exclude_file.name])
+        if randomize:
+            args.append('-r')
+        if tests:
+            args.extend(['--', tests])
+        remote.run(args=args, logger=log.getChild(role))
+    finally:
+        log.info('Removing {script} on {role}'.format(script=test_script,
+                                                      role=role))
+        remote.run(args=['rm', '-f', test_path])
+
+@contextlib.contextmanager
+def xfstests(ctx, config):
+    """
+    Run xfstests over rbd devices.  This interface sets up all
+    required configuration automatically if not otherwise specified.
+    Note that only one instance of xfstests can run on a single host
+    at a time.  By default, the set of tests specified is run once.
+    If a (non-zero) count value is supplied, the complete set of
+    tests will be run that number of times.
+
+    For example::
+
+        tasks:
+        - ceph:
+        # Image sizes are in MB
+        - rbd.xfstests:
+            client.0:
+                count: 3
+                test_image: 'test_image'
+                test_size: 250
+                test_format: 2
+                scratch_image: 'scratch_image'
+                scratch_size: 250
+                scratch_format: 1
+                fs_type: 'xfs'
+                tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+                exclude:
+                - generic/42
+                randomize: true
+                xfstests_url: 'https://raw.github.com/ceph/ceph-ci/wip-55555/qa'
+    """
+    if config is None:
+        config = { 'all': None }
+    assert isinstance(config, dict) or isinstance(config, list), \
+        "task xfstests only supports a list or dictionary for configuration"
+    if isinstance(config, dict):
+        config = teuthology.replace_all_with_clients(ctx.cluster, config)
+        runs = config.items()
+    else:
+        runs = [(role, None) for role in config]
+
+    running_xfstests = {}
+    for role, properties in runs:
+        assert role.startswith('client.'), \
+            "task xfstests can only run on client nodes"
+        for host, roles_for_host in ctx.cluster.remotes.items():
+            if role in roles_for_host:
+                assert host not in running_xfstests, \
+                    "task xfstests allows only one instance at a time per host"
+                running_xfstests[host] = True
+
+    images_config = {}
+    scratch_config = {}
+    modprobe_config = {}
+    image_map_config = {}
+    scratch_map_config = {}
+    xfstests_config = {}
+    for role, properties in runs:
+        if properties is None:
+            properties = {}
+
+        test_image = properties.get('test_image', 'test_image.{role}'.format(role=role))
+        test_size = properties.get('test_size', 10000) # 10G
+        test_fmt = properties.get('test_format', 1)
+        scratch_image = properties.get('scratch_image', 'scratch_image.{role}'.format(role=role))
+        scratch_size = properties.get('scratch_size', 10000) # 10G
+        scratch_fmt = properties.get('scratch_format', 1)
+
+        images_config[role] = dict(
+            image_name=test_image,
+            image_size=test_size,
+            image_format=test_fmt,
+            )
+
+        scratch_config[role] = dict(
+            image_name=scratch_image,
+            image_size=scratch_size,
+            image_format=scratch_fmt,
+            )
+
+        xfstests_branch = properties.get('xfstests_branch', 'master')
+        xfstests_url = properties.get('xfstests_url', 'https://raw.github.com/ceph/ceph/{branch}/qa'.format(branch=xfstests_branch))
+
+        xfstests_config[role] = dict(
+            count=properties.get('count', 1),
+            test_dev='/dev/rbd/rbd/{image}'.format(image=test_image),
+            scratch_dev='/dev/rbd/rbd/{image}'.format(image=scratch_image),
+            fs_type=properties.get('fs_type', 'xfs'),
+            randomize=properties.get('randomize', False),
+            tests=properties.get('tests'),
+            exclude=properties.get('exclude', []),
+            xfstests_url=xfstests_url,
+            )
+
+        log.info('Setting up xfstests using RBD images:')
+        log.info('      test ({size} MB): {image}'.format(size=test_size,
+                                                        image=test_image))
+        log.info('   scratch ({size} MB): {image}'.format(size=scratch_size,
+                                                        image=scratch_image))
+        modprobe_config[role] = None
+        image_map_config[role] = {'image_name': test_image}
+        scratch_map_config[role] = {'image_name': scratch_image}
+
+    with contextutil.nested(
+        lambda: create_image(ctx=ctx, config=images_config),
+        lambda: create_image(ctx=ctx, config=scratch_config),
+        lambda: modprobe(ctx=ctx, config=modprobe_config),
+        lambda: dev_create(ctx=ctx, config=image_map_config),
+        lambda: dev_create(ctx=ctx, config=scratch_map_config),
+        lambda: run_xfstests(ctx=ctx, config=xfstests_config),
+        ):
+        yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Create and mount an rbd image.
+
+    For example, you can specify which clients to run on::
+
+        tasks:
+        - ceph:
+        - rbd: [client.0, client.1]
+
+    There are a few image options::
+
+        tasks:
+        - ceph:
+        - rbd:
+            client.0: # uses defaults
+            client.1:
+                image_name: foo
+                image_size: 2048
+                image_format: 2
+                fs_type: xfs
+
+    To use default options on all clients::
+
+        tasks:
+        - ceph:
+        - rbd:
+            all:
+
+    To create 20GiB images and format them with xfs on all clients::
+
+        tasks:
+        - ceph:
+        - rbd:
+            all:
+              image_size: 20480
+              fs_type: xfs
+    """
+    if config is None:
+        config = { 'all': None }
+    norm_config = config
+    if isinstance(config, dict):
+        norm_config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    if isinstance(norm_config, dict):
+        role_images = {}
+        for role, properties in norm_config.items():
+            if properties is None:
+                properties = {}
+            role_images[role] = properties.get('image_name')
+    else:
+        role_images = norm_config
+
+    log.debug('rbd config is: %s', norm_config)
+
+    with contextutil.nested(
+        lambda: create_image(ctx=ctx, config=norm_config),
+        lambda: modprobe(ctx=ctx, config=norm_config),
+        lambda: dev_create(ctx=ctx, config=norm_config),
+        lambda: generic_mkfs(ctx=ctx, config=norm_config,
+                devname_rtn=rbd_devname_rtn),
+        lambda: generic_mount(ctx=ctx, config=role_images,
+                devname_rtn=rbd_devname_rtn),
+        ):
+        yield
diff --git a/qa/tasks/rbd_fio.py b/qa/tasks/rbd_fio.py
new file mode 100644
index 000000000..959d07d49
--- /dev/null
+++ b/qa/tasks/rbd_fio.py
@@ -0,0 +1,225 @@
+"""
+ Long running fio tests on rbd mapped devices for format/features provided in config
+ Many fio parameters can be configured so that this task can be used along with thrash/power-cut tests
+ and exercise IO on full disk for all format/features
+  - This test should not be run on VM due to heavy use of resource
+
+"""
+import contextlib
+import json
+import logging
+import os
+
+from teuthology.parallel import parallel
+from teuthology import misc as teuthology
+from tempfile import NamedTemporaryFile
+from teuthology.orchestra import run
+from teuthology.packaging import install_package, remove_package
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    client.0:
+       fio-io-size: 100g or 80% or 100m
+       fio-version: 2.2.9
+       formats: [2]
+       features: [[layering],[striping],[layering,exclusive-lock,object-map]]
+       test-clone-io: 1  #remove this option to not run create rbd clone and not run io on clone
+       io-engine: "sync or rbd or any io-engine"
+       rw: randrw
+    client.1:
+       fio-io-size: 100g
+       fio-version: 2.2.9
+       rw: read
+       image-size:20480
+
+or
+    all:
+       fio-io-size: 400g
+       rw: randrw
+       formats: [2]
+       features: [[layering],[striping]]
+       io-engine: libaio
+
+    Create rbd image + device and exercise IO for format/features provided in config file
+    Config can be per client or one config can be used for all clients, fio jobs are run in parallel for client provided
+
+    """
+    if config.get('all'):
+        client_config = config['all']
+    clients = ctx.cluster.only(teuthology.is_type('client'))
+    rbd_test_dir = teuthology.get_testdir(ctx) + "/rbd_fio_test"
+    for remote,role in clients.remotes.items():
+        if 'client_config' in locals():
+           with parallel() as p:
+               p.spawn(run_fio, remote, client_config, rbd_test_dir)
+        else:
+           for client_config in config:
+              if client_config in role:
+                 with parallel() as p:
+                     p.spawn(run_fio, remote, config[client_config], rbd_test_dir)
+
+    yield
+
+
+def get_ioengine_package_name(ioengine, remote):
+    system_type = teuthology.get_system_type(remote)
+    if ioengine == 'rbd':
+        return 'librbd1-devel' if system_type == 'rpm' else 'librbd-dev'
+    elif ioengine == 'libaio':
+        return 'libaio-devel' if system_type == 'rpm' else 'libaio-dev'
+    else:
+        return None
+
+
+def run_rbd_map(remote, image, iodepth):
+    iodepth = max(iodepth, 128)  # RBD_QUEUE_DEPTH_DEFAULT
+    dev = remote.sh(['sudo', 'rbd', 'device', 'map', '-o',
+                     'queue_depth={}'.format(iodepth), image]).rstrip('\n')
+    remote.sudo_write_file(
+        '/sys/block/{}/queue/nr_requests'.format(os.path.basename(dev)),
+        str(iodepth))
+    return dev
+
+
+def run_fio(remote, config, rbd_test_dir):
+    """
+    create fio config file with options based on above config
+    get the fio from github, generate binary, and use it to run on
+    the generated fio config file
+    """
+    fio_config=NamedTemporaryFile(mode='w', prefix='fio_rbd_', dir='/tmp/', delete=False)
+    fio_config.write('[global]\n')
+    if config.get('io-engine'):
+        ioengine=config['io-engine']
+        fio_config.write('ioengine={ioe}\n'.format(ioe=ioengine))
+    else:
+        fio_config.write('ioengine=sync\n')
+    if config.get('bs'):
+        bs=config['bs']
+        fio_config.write('bs={bs}\n'.format(bs=bs))
+    else:
+        fio_config.write('bs=4k\n')
+    iodepth = config.get('io-depth', 2)
+    fio_config.write('iodepth={iod}\n'.format(iod=iodepth))
+    if config.get('fio-io-size'):
+        size=config['fio-io-size']
+        fio_config.write('size={size}\n'.format(size=size))
+    else:
+        fio_config.write('size=100m\n')
+
+    fio_config.write('time_based\n')
+    if config.get('runtime'):
+        runtime=config['runtime']
+        fio_config.write('runtime={runtime}\n'.format(runtime=runtime))
+    else:
+        fio_config.write('runtime=1800\n')
+    fio_config.write('allow_file_create=0\n')
+    image_size=10240
+    if config.get('image_size'):
+        image_size=config['image_size']
+
+    formats=[1,2]
+    features=[['layering'],['striping'],['exclusive-lock','object-map']]
+    fio_version='3.32'
+    if config.get('formats'):
+        formats=config['formats']
+    if config.get('features'):
+        features=config['features']
+    if config.get('fio-version'):
+        fio_version=config['fio-version']
+
+    # handle package required for ioengine, if any
+    sn=remote.shortname
+    ioengine_pkg = get_ioengine_package_name(ioengine, remote)
+    if ioengine_pkg:
+        install_package(ioengine_pkg, remote)
+
+    fio_config.write('norandommap\n')
+    if ioengine == 'rbd':
+        fio_config.write('clientname=admin\n')
+        fio_config.write('pool=rbd\n')
+        fio_config.write('invalidate=0\n')
+    elif ioengine == 'libaio':
+        fio_config.write('direct=1\n')
+    for frmt in formats:
+        for feature in features:
+           log.info("Creating rbd images on {sn}".format(sn=sn))
+           feature_name = '-'.join(feature)
+           rbd_name = 'i{i}f{f}{sn}'.format(i=frmt,f=feature_name,sn=sn)
+           rbd_snap_name = 'i{i}f{f}{sn}@i{i}f{f}{sn}Snap'.format(i=frmt,f=feature_name,sn=sn)
+           rbd_clone_name = 'i{i}f{f}{sn}Clone'.format(i=frmt,f=feature_name,sn=sn)
+           create_args=['rbd', 'create',
+                        '--size', '{size}'.format(size=image_size),
+                        '--image', rbd_name,
+                        '--image-format', '{f}'.format(f=frmt)]
+           map(lambda x: create_args.extend(['--image-feature', x]), feature)
+           if config.get('thick-provision'):
+               create_args.append('--thick-provision')
+           remote.run(args=create_args)
+           remote.run(args=['rbd', 'info', rbd_name])
+           if ioengine != 'rbd':
+               rbd_dev = run_rbd_map(remote, rbd_name, iodepth)
+               if config.get('test-clone-io'):
+                    log.info("Testing clones using fio")
+                    remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
+                    remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
+                    remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
+                    rbd_clone_dev = run_rbd_map(remote, rbd_clone_name, iodepth)
+               fio_config.write('[{rbd_dev}]\n'.format(rbd_dev=rbd_dev))
+               if config.get('rw'):
+                   rw=config['rw']
+                   fio_config.write('rw={rw}\n'.format(rw=rw))
+               else:
+                   fio_config .write('rw=randrw\n')
+               fio_config.write('filename={rbd_dev}\n'.format(rbd_dev=rbd_dev))
+               if config.get('test-clone-io'):
+                   fio_config.write('[{rbd_clone_dev}]\n'.format(rbd_clone_dev=rbd_clone_dev))
+                   fio_config.write('rw={rw}\n'.format(rw=rw))
+                   fio_config.write('filename={rbd_clone_dev}\n'.format(rbd_clone_dev=rbd_clone_dev))
+           else:
+               if config.get('test-clone-io'):
+                    log.info("Testing clones using fio")
+                    remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
+                    remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
+                    remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
+               fio_config.write('[{img_name}]\n'.format(img_name=rbd_name))
+               if config.get('rw'):
+                   rw=config['rw']
+                   fio_config.write('rw={rw}\n'.format(rw=rw))
+               else:
+                   fio_config.write('rw=randrw\n')
+               fio_config.write('rbdname={img_name}\n'.format(img_name=rbd_name))
+               if config.get('test-clone-io'):
+                   fio_config.write('[{clone_img_name}]\n'.format(clone_img_name=rbd_clone_name))
+                   fio_config.write('rw={rw}\n'.format(rw=rw))
+                   fio_config.write('rbdname={clone_img_name}\n'.format(clone_img_name=rbd_clone_name))
+
+
+    fio_config.close()
+    remote.put_file(fio_config.name,fio_config.name)
+    try:
+        log.info("Running rbd feature - fio test on {sn}".format(sn=sn))
+        fio = "https://github.com/axboe/fio/archive/fio-" + fio_version + ".tar.gz"
+        remote.run(args=['mkdir', run.Raw(rbd_test_dir),])
+        remote.run(args=['cd' , run.Raw(rbd_test_dir),
+                         run.Raw(';'), 'wget', fio, run.Raw(';'), run.Raw('tar -xvf fio*tar.gz'), run.Raw(';'),
+                         run.Raw('cd fio-fio*'), run.Raw(';'), './configure', run.Raw(';'), 'make'])
+        remote.run(args=['ceph', '-s'])
+        remote.run(args=[run.Raw('{tdir}/fio-fio-{v}/fio --showcmd {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))])
+        remote.run(args=['sudo', run.Raw('{tdir}/fio-fio-{v}/fio {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))])
+        remote.run(args=['ceph', '-s'])
+    finally:
+        out = remote.sh('rbd device list --format=json')
+        mapped_images = json.loads(out)
+        if mapped_images:
+            log.info("Unmapping rbd images on {sn}".format(sn=sn))
+            for image in mapped_images:
+                remote.run(args=['sudo', 'rbd', 'device', 'unmap',
+                                 str(image['device'])])
+        log.info("Cleaning up fio install")
+        remote.run(args=['rm','-rf', run.Raw(rbd_test_dir)])
+        if ioengine_pkg:
+            remove_package(ioengine_pkg, remote)
diff --git a/qa/tasks/rbd_fsx.py b/qa/tasks/rbd_fsx.py
new file mode 100644
index 000000000..efea7208e
--- /dev/null
+++ b/qa/tasks/rbd_fsx.py
@@ -0,0 +1,115 @@
+"""
+Run fsx on an rbd image
+"""
+import contextlib
+import logging
+
+from teuthology.exceptions import ConfigError
+from teuthology.parallel import parallel
+from teuthology import misc as teuthology
+from tasks.ceph_manager import get_valgrind_args
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run fsx on an rbd image.
+
+    Currently this requires running as client.admin
+    to create a pool.
+
+    Specify which clients to run on as a list::
+
+      tasks:
+        ceph:
+        rbd_fsx:
+          clients: [client.0, client.1]
+
+    You can optionally change some properties of fsx:
+
+      tasks:
+        ceph:
+        rbd_fsx:
+          clients: <list of clients>
+          seed: <random seed number, or 0 to use the time>
+          ops: <number of operations to do>
+          size: <maximum image size in bytes>
+          valgrind: [--tool=<valgrind tool>]
+    """
+    log.info('starting rbd_fsx...')
+    with parallel() as p:
+        for role in config['clients']:
+            p.spawn(_run_one_client, ctx, config, role)
+    yield
+
+def _run_one_client(ctx, config, role):
+    """Spawned task that runs the client"""
+    krbd = config.get('krbd', False)
+    nbd = config.get('nbd', False)
+    testdir = teuthology.get_testdir(ctx)
+    (remote,) = ctx.cluster.only(role).remotes.keys()
+
+    args = []
+    if krbd or nbd:
+        args.append('sudo') # rbd(-nbd) map/unmap need privileges
+    args.extend([
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir)
+    ])
+
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('rbd_fsx', {}))
+
+    if config.get('valgrind'):
+        args = get_valgrind_args(
+            testdir,
+            'fsx_{id}'.format(id=role),
+            args,
+            config.get('valgrind')
+        )
+
+    cluster_name, type_, client_id = teuthology.split_role(role)
+    if type_ != 'client':
+        msg = 'client role ({0}) must be a client'.format(role)
+        raise ConfigError(msg)
+
+    args.extend([
+        'ceph_test_librbd_fsx',
+        '--cluster', cluster_name,
+        '--id', client_id,
+        '-d', # debug output for all operations
+        '-W', '-R', # mmap doesn't work with rbd
+        '-p', str(config.get('progress_interval', 100)), # show progress
+        '-P', '{tdir}/archive'.format(tdir=testdir),
+        '-r', str(config.get('readbdy',1)),
+        '-w', str(config.get('writebdy',1)),
+        '-t', str(config.get('truncbdy',1)),
+        '-h', str(config.get('holebdy',1)),
+        '-l', str(config.get('size', 250000000)),
+        '-S', str(config.get('seed', 0)),
+        '-N', str(config.get('ops', 1000)),
+    ])
+    if krbd:
+        args.append('-K') # -K enables krbd mode
+    if nbd:
+        args.append('-M') # -M enables nbd mode
+    if config.get('direct_io', False):
+        args.append('-Z') # -Z use direct IO
+    if not config.get('randomized_striping', True):
+        args.append('-U') # -U disables randomized striping
+    if not config.get('punch_holes', True):
+        args.append('-H') # -H disables discard ops
+    if config.get('deep_copy', False):
+        args.append('-g') # -g deep copy instead of clone
+    if config.get('journal_replay', False):
+        args.append('-j') # -j replay all IO events from journal
+    if config.get('keep_images', False):
+        args.append('-k') # -k keep images on success
+    args.extend([
+        config.get('pool_name', 'pool_{pool}'.format(pool=role)),
+        'image_{image}'.format(image=role),
+    ])
+
+    remote.run(args=args)
diff --git a/qa/tasks/rbd_mirror.py b/qa/tasks/rbd_mirror.py
new file mode 100644
index 000000000..5da252560
--- /dev/null
+++ b/qa/tasks/rbd_mirror.py
@@ -0,0 +1,120 @@
+"""
+Task for running rbd mirroring daemons and configuring mirroring
+"""
+
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology.task import Task
+from tasks.ceph_manager import get_valgrind_args
+from tasks.util import get_remote_for_role
+
+log = logging.getLogger(__name__)
+
+
+class RBDMirror(Task):
+    """
+    Run an rbd-mirror daemon to sync rbd images between clusters.
+
+    This requires two clients (one from each cluster) on the same host
+    to connect with. The pool configuration should be adjusted by later
+    test scripts to include the remote client and cluster name. This task
+    just needs to know how to connect to the local cluster.
+
+    For example:
+
+        roles:
+        - [primary.mon.a, primary.osd.0, primary.osd.1, primary.osd.2]
+        - [secondary.mon.a, secondary.osd.0, secondary.osd.1, secondary.osd.2]
+        - [primary.client.mirror, secondary.client.mirror]
+        tasks:
+        - ceph:
+            cluster: primary
+        - ceph:
+            cluster: secondary
+        - rbd-mirror:
+            client: primary.client.mirror
+
+    To mirror back to the primary cluster as well, add another
+    rbd_mirror instance:
+
+        - rbd-mirror:
+            client: secondary.client.mirror
+
+    Possible options for this task are:
+
+        client: role - ceph client to connect as
+        valgrind: [--tool=<valgrind tool>] - none by default
+        coverage: bool - whether this run may be collecting coverage data
+        thrash: bool - whether this run may be thrashed
+    """
+    def __init__(self, ctx, config):
+        super(RBDMirror, self).__init__(ctx, config)
+        self.log = log
+
+    def setup(self):
+        super(RBDMirror, self).setup()
+        try:
+            self.client = self.config['client']
+        except KeyError:
+            raise ConfigError('rbd-mirror requires a client to connect with')
+
+        self.cluster_name, type_, self.client_id = misc.split_role(self.client)
+
+        if type_ != 'client':
+            msg = 'client role ({0}) must be a client'.format(self.client)
+            raise ConfigError(msg)
+
+        self.remote = get_remote_for_role(self.ctx, self.client)
+
+    def begin(self):
+        super(RBDMirror, self).begin()
+        testdir = misc.get_testdir(self.ctx)
+        daemon_signal = 'kill'
+        if 'coverage' in self.config or 'valgrind' in self.config or \
+                self.config.get('thrash', False):
+            daemon_signal = 'term'
+
+        args = [
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            'daemon-helper',
+            daemon_signal,
+            ]
+
+        if 'valgrind' in self.config:
+            args = get_valgrind_args(
+                testdir,
+                'rbd-mirror-{id}'.format(id=self.client),
+                args,
+                self.config.get('valgrind')
+            )
+
+        args.extend([
+            'rbd-mirror', '--foreground',
+            '--cluster',
+            self.cluster_name,
+            '--id',
+            self.client_id,
+            ])
+
+        self.ctx.daemons.add_daemon(
+            self.remote, 'rbd-mirror', self.client,
+            cluster=self.cluster_name,
+            args=args,
+            logger=self.log.getChild(self.client),
+            stdin=run.PIPE,
+            wait=False,
+        )
+
+    def end(self):
+        mirror_daemon = self.ctx.daemons.get_daemon('rbd-mirror',
+                                                    self.client,
+                                                    self.cluster_name)
+        mirror_daemon.stop()
+        super(RBDMirror, self).end()
+
+task = RBDMirror
diff --git a/qa/tasks/rbd_mirror_thrash.py b/qa/tasks/rbd_mirror_thrash.py
new file mode 100644
index 000000000..a42d19e70
--- /dev/null
+++ b/qa/tasks/rbd_mirror_thrash.py
@@ -0,0 +1,218 @@
+"""
+Task for thrashing rbd-mirror daemons
+"""
+
+import contextlib
+import logging
+import random
+import signal
+import socket
+import time
+
+from gevent import sleep
+from gevent.greenlet import Greenlet
+from gevent.event import Event
+
+from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra import run
+from tasks.thrasher import Thrasher
+
+log = logging.getLogger(__name__)
+
+
+class RBDMirrorThrasher(Thrasher, Greenlet):
+    """
+    RBDMirrorThrasher::
+
+    The RBDMirrorThrasher thrashes rbd-mirror daemons during execution of other
+    tasks (workunits, etc).
+
+    The config is optional.  Many of the config parameters are a maximum value
+    to use when selecting a random value from a range.  The config is a dict
+    containing some or all of:
+
+    cluster: [default: ceph] cluster to thrash
+
+    max_thrash: [default: 1] the maximum number of active rbd-mirror daemons per
+      cluster will be thrashed at any given time.
+
+    min_thrash_delay: [default: 60] minimum number of seconds to delay before
+      thrashing again.
+
+    max_thrash_delay: [default: 120] maximum number of seconds to delay before
+      thrashing again.
+
+    max_revive_delay: [default: 10] maximum number of seconds to delay before
+      bringing back a thrashed rbd-mirror daemon.
+
+    randomize: [default: true] enables randomization and use the max/min values
+
+    seed: [no default] seed the random number generator
+
+    Examples::
+
+      The following example disables randomization, and uses the max delay
+      values:
+
+      tasks:
+      - ceph:
+      - rbd_mirror_thrash:
+          randomize: False
+          max_thrash_delay: 10
+    """
+
+    def __init__(self, ctx, config, cluster, daemons):
+        super(RBDMirrorThrasher, self).__init__()
+
+        self.ctx = ctx
+        self.config = config
+        self.cluster = cluster
+        self.daemons = daemons
+
+        self.logger = log
+        self.name = 'thrasher.rbd_mirror.[{cluster}]'.format(cluster = cluster)
+        self.stopping = Event()
+
+        self.randomize = bool(self.config.get('randomize', True))
+        self.max_thrash = int(self.config.get('max_thrash', 1))
+        self.min_thrash_delay = float(self.config.get('min_thrash_delay', 60.0))
+        self.max_thrash_delay = float(self.config.get('max_thrash_delay', 120.0))
+        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
+
+    def _run(self):
+        try:
+            self.do_thrash()
+        except Exception as e:
+            # See _run exception comment for MDSThrasher
+            self.set_thrasher_exception(e)
+            self.logger.exception("exception:")
+            # Allow successful completion so gevent doesn't see an exception.
+            # The DaemonWatchdog will observe the error and tear down the test.
+
+    def log(self, x):
+        """Write data to logger assigned to this RBDMirrorThrasher"""
+        self.logger.info(x)
+
+    def stop(self):
+        self.stopping.set()
+
+    def do_thrash(self):
+        """
+        Perform the random thrashing action
+        """
+
+        self.log('starting thrash for cluster {cluster}'.format(cluster=self.cluster))
+        stats = {
+            "kill": 0,
+        }
+
+        while not self.stopping.is_set():
+            delay = self.max_thrash_delay
+            if self.randomize:
+                delay = random.randrange(self.min_thrash_delay, self.max_thrash_delay)
+
+            if delay > 0.0:
+                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
+                self.stopping.wait(delay)
+                if self.stopping.is_set():
+                    continue
+
+            killed_daemons = []
+
+            weight = 1.0 / len(self.daemons)
+            count = 0
+            for daemon in self.daemons:
+                skip = random.uniform(0.0, 1.0)
+                if weight <= skip:
+                    self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
+                        label=daemon.id_, skip=skip, weight=weight))
+                    continue
+
+                self.log('kill {label}'.format(label=daemon.id_))
+                try:
+                    daemon.signal(signal.SIGTERM)
+                except socket.error:
+                    pass
+                killed_daemons.append(daemon)
+                stats['kill'] += 1
+
+                # if we've reached max_thrash, we're done
+                count += 1
+                if count >= self.max_thrash:
+                    break
+
+            if killed_daemons:
+                # wait for a while before restarting
+                delay = self.max_revive_delay
+                if self.randomize:
+                    delay = random.randrange(0.0, self.max_revive_delay)
+
+                self.log('waiting for {delay} secs before reviving daemons'.format(delay=delay))
+                sleep(delay)
+
+                for daemon in killed_daemons:
+                    self.log('waiting for {label}'.format(label=daemon.id_))
+                    try:
+                        run.wait([daemon.proc], timeout=600)
+                    except CommandFailedError:
+                        pass
+                    except:
+                        self.log('Failed to stop {label}'.format(label=daemon.id_))
+
+                        try:
+                            # try to capture a core dump
+                            daemon.signal(signal.SIGABRT)
+                        except socket.error:
+                            pass
+                        raise
+                    finally:
+                        daemon.reset()
+
+                for daemon in killed_daemons:
+                    self.log('reviving {label}'.format(label=daemon.id_))
+                    daemon.start()
+
+        for stat in stats:
+            self.log("stat['{key}'] = {value}".format(key = stat, value = stats[stat]))
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Stress test the rbd-mirror by thrashing while another task/workunit
+    is running.
+
+    Please refer to RBDMirrorThrasher class for further information on the
+    available options.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'rbd_mirror_thrash task only accepts a dict for configuration'
+
+    cluster = config.get('cluster', 'ceph')
+    daemons = list(ctx.daemons.iter_daemons_of_role('rbd-mirror', cluster))
+    assert len(daemons) > 0, \
+        'rbd_mirror_thrash task requires at least 1 rbd-mirror daemon'
+
+    # choose random seed
+    if 'seed' in config:
+        seed = int(config['seed'])
+    else:
+        seed = int(time.time())
+    log.info('rbd_mirror_thrash using random seed: {seed}'.format(seed=seed))
+    random.seed(seed)
+
+    thrasher = RBDMirrorThrasher(ctx, config, cluster, daemons)
+    thrasher.start()
+    ctx.ceph[cluster].thrashers.append(thrasher)
+
+    try:
+        log.debug('Yielding')
+        yield
+    finally:
+        log.info('joining rbd_mirror_thrash')
+        thrasher.stop()
+        if thrasher.exception is not None:
+            raise RuntimeError('error during thrashing')
+        thrasher.join()
+        log.info('done joining')
diff --git a/qa/tasks/rbd_pwl_cache_recovery.py b/qa/tasks/rbd_pwl_cache_recovery.py
new file mode 100644
index 000000000..e13c1f664
--- /dev/null
+++ b/qa/tasks/rbd_pwl_cache_recovery.py
@@ -0,0 +1,96 @@
+"""
+persistent write log cache recovery task
+"""
+import contextlib
+import logging
+import random
+import json
+import time
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+DEFAULT_NUM_ITERATIONS = 20
+IO_PATTERNS = ("full-seq", "rand")
+IO_SIZES = ('4K', '16K', '128K', '1024K')
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def thrashes_rbd_bench_on_persistent_cache(ctx, config):
+    """
+    thrashes rbd bench on persistent write log cache.
+    It can test recovery feature of persistent write log cache.
+    """
+    log.info("thrashes rbd bench on persistent write log cache")
+
+    client, client_config = list(config.items())[0]
+    (remote,) = ctx.cluster.only(client).remotes.keys()
+    client_config = client_config if client_config is not None else dict()
+    image_name = client_config.get('image_name', 'testimage')
+    num_iterations = client_config.get('num_iterations', DEFAULT_NUM_ITERATIONS)
+
+    for i in range(num_iterations):
+        log.info("start rbd bench")
+        # rbd bench could not specify the run time so set a large enough test size.
+        remote.run(
+            args=[
+                'rbd', 'bench',
+                '--io-type', 'write',
+                '--io-pattern', random.choice(IO_PATTERNS),
+                '--io-size', random.choice(IO_SIZES),
+                '--io-total', '100G',
+                image_name,
+                ],
+            wait=False,
+        )
+        # Wait a few seconds for the rbd bench process to run
+        # and complete the pwl cache initialization
+        time.sleep(10)
+        log.info("dump cache state when rbd bench running.")
+        remote.sh(['rbd', 'status', image_name, '--format=json'])
+        log.info("sleep...")
+        time.sleep(random.randint(10, 60))
+        log.info("rbd bench crash.")
+        remote.run(
+            args=[
+                'killall', '-9', 'rbd',
+                ],
+            check_status=False,
+        )
+        log.info("wait for watch timeout.")
+        time.sleep(40)
+        log.info("check cache state after crash.")
+        out = remote.sh(['rbd', 'status', image_name, '--format=json'])
+        rbd_status = json.loads(out)
+        assert len(rbd_status['watchers']) == 0
+        assert rbd_status['persistent_cache']['present'] == True
+        assert rbd_status['persistent_cache']['empty'] == False
+        assert rbd_status['persistent_cache']['clean'] == False
+        log.info("check dirty cache file.")
+        remote.run(
+            args=[
+                'test', '-e', rbd_status['persistent_cache']['path'],
+                ]
+        )
+    try:
+        yield
+    finally:
+        log.info("cleanup")
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    This is task for testing persistent write log cache recovery.
+    """
+    assert isinstance(config, dict), \
+            "task rbd_pwl_cache_recovery only supports a dictionary for configuration"
+
+    managers = []
+    config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    managers.append(
+        lambda: thrashes_rbd_bench_on_persistent_cache(ctx=ctx, config=config)
+        )
+
+    with contextutil.nested(*managers):
+        yield
diff --git a/qa/tasks/rebuild_mondb.py b/qa/tasks/rebuild_mondb.py
new file mode 100644
index 000000000..008e312e2
--- /dev/null
+++ b/qa/tasks/rebuild_mondb.py
@@ -0,0 +1,224 @@
+"""
+Test if we can recover the leveldb from OSD after where all leveldbs are
+corrupted
+"""
+
+import logging
+import os.path
+import shutil
+import tempfile
+
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+def _push_directory(path, remote, remote_dir):
+    """
+    local_temp_path=`mktemp`
+    tar czf $local_temp_path $path
+    ssh remote mkdir -p remote_dir
+    remote_temp_path=`mktemp`
+    scp $local_temp_path $remote_temp_path
+    rm $local_temp_path
+    tar xzf $remote_temp_path -C $remote_dir
+    ssh remote:$remote_temp_path
+    """
+    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
+                                           prefix='rebuild_mondb-')
+    os.close(fd)
+    cmd = ' '.join(['tar', 'cz',
+                    '-f', local_temp_path,
+                    '-C', path,
+                    '--', '.'])
+    teuthology.sh(cmd)
+    _, fname = os.path.split(local_temp_path)
+    fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
+                                            prefix='rebuild_mondb-')
+    os.close(fd)
+    remote.put_file(local_temp_path, remote_temp_path)
+    os.remove(local_temp_path)
+    remote.run(args=['sudo',
+                     'tar', 'xz',
+                     '-C', remote_dir,
+                     '-f', remote_temp_path])
+    remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])
+
+
+def _nuke_mons(manager, mons, mon_id):
+    assert mons
+    is_mon = teuthology.is_type('mon')
+    for remote, roles in mons.remotes.items():
+        for role in roles:
+            if not is_mon(role):
+                continue
+            cluster, _, m = teuthology.split_role(role)
+            log.info('killing {cluster}:mon.{mon}'.format(
+                cluster=cluster,
+                mon=m))
+            manager.kill_mon(m)
+            mon_data = os.path.join('/var/lib/ceph/mon/',
+                                    '{0}-{1}'.format(cluster, m))
+            if m == mon_id:
+                # so we will only need to recreate the store.db for the
+                # first mon, would be easier than mkfs on it then replace
+                # the its store.db with the recovered one
+                store_dir = os.path.join(mon_data, 'store.db')
+                remote.run(args=['sudo', 'rm', '-r', store_dir])
+            else:
+                remote.run(args=['sudo', 'rm', '-r', mon_data])
+
+
+def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
+    local_mstore = tempfile.mkdtemp()
+
+    # collect the maps from all OSDs
+    is_osd = teuthology.is_type('osd')
+    osds = ctx.cluster.only(is_osd)
+    assert osds
+    for osd, roles in osds.remotes.items():
+        for role in roles:
+            if not is_osd(role):
+                continue
+            cluster, _, osd_id = teuthology.split_role(role)
+            assert cluster_name == cluster
+            log.info('collecting maps from {cluster}:osd.{osd}'.format(
+                cluster=cluster,
+                osd=osd_id))
+            # push leveldb to OSD
+            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
+            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])
+
+            _push_directory(local_mstore, osd, osd_mstore)
+            log.info('rm -rf {0}'.format(local_mstore))
+            shutil.rmtree(local_mstore)
+            # update leveldb with OSD data
+            options = '--no-mon-config --op update-mon-db --mon-store-path {0}'
+            log.info('cot {0}'.format(osd_mstore))
+            manager.objectstore_tool(pool=None,
+                                     options=options.format(osd_mstore),
+                                     args='',
+                                     osd=osd_id,
+                                     do_revive=False)
+            # pull the updated mon db
+            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
+            local_mstore = tempfile.mkdtemp()
+            teuthology.pull_directory(osd, osd_mstore, local_mstore)
+            log.info('rm -rf osd:{0}'.format(osd_mstore))
+            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])
+
+    # recover the first_mon with re-built mon db
+    # pull from recovered leveldb from client
+    mon_store_dir = os.path.join('/var/lib/ceph/mon',
+                                 '{0}-{1}'.format(cluster_name, mon_id))
+    _push_directory(local_mstore, mon, mon_store_dir)
+    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
+    shutil.rmtree(local_mstore)
+
+    # fill up the caps in the keyring file
+    mon.run(args=['sudo',
+                  'ceph-authtool', keyring_path,
+                  '-n', 'mon.',
+                  '--cap', 'mon', 'allow *'])
+    mon.run(args=['sudo',
+                  'ceph-authtool', keyring_path,
+                  '-n', 'client.admin',
+                  '--cap', 'mon', 'allow *',
+                  '--cap', 'osd', 'allow *',
+                  '--cap', 'mds', 'allow *',
+                  '--cap', 'mgr', 'allow *'])
+    mon.run(args=['sudo', '-u', 'ceph',
+                  'CEPH_ARGS=--no-mon-config',
+                  'ceph-monstore-tool', mon_store_dir,
+                  'rebuild', '--',
+                  '--keyring', keyring_path,
+                  '--monmap', '/tmp/monmap',
+                  ])
+
+
+def _revive_mons(manager, mons, recovered, keyring_path):
+    # revive monitors
+    # the initial monmap is in the ceph.conf, so we are good.
+    n_mons = 0
+    is_mon = teuthology.is_type('mon')
+    for remote, roles in mons.remotes.items():
+        for role in roles:
+            if not is_mon(role):
+                continue
+            cluster, _, m = teuthology.split_role(role)
+            if recovered != m:
+                log.info('running mkfs on {cluster}:mon.{mon}'.format(
+                    cluster=cluster,
+                    mon=m))
+                remote.run(
+                    args=[
+                        'sudo',
+                        'ceph-mon',
+                        '--cluster', cluster,
+                        '--mkfs',
+                        '-i', m,
+                        '--keyring', keyring_path,
+                        '--monmap', '/tmp/monmap'])
+            log.info('reviving mon.{0}'.format(m))
+            manager.revive_mon(m)
+            n_mons += 1
+    manager.wait_for_mon_quorum_size(n_mons, timeout=30)
+
+
+def _revive_mgrs(ctx, manager):
+    is_mgr = teuthology.is_type('mgr')
+    mgrs = ctx.cluster.only(is_mgr)
+    for _, roles in mgrs.remotes.items():
+        for role in roles:
+            if not is_mgr(role):
+                continue
+            _, _, mgr_id = teuthology.split_role(role)
+            log.info('reviving mgr.{0}'.format(mgr_id))
+            manager.revive_mgr(mgr_id)
+
+
+def _revive_osds(ctx, manager):
+    is_osd = teuthology.is_type('osd')
+    osds = ctx.cluster.only(is_osd)
+    for _, roles in osds.remotes.items():
+        for role in roles:
+            if not is_osd(role):
+                continue
+            _, _, osd_id = teuthology.split_role(role)
+            log.info('reviving osd.{0}'.format(osd_id))
+            manager.revive_osd(osd_id)
+
+
+def task(ctx, config):
+    """
+    Test monitor recovery from OSD
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    # stash a monmap for later
+    mon.run(args=['ceph', 'mon', 'getmap', '-o', '/tmp/monmap'])
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'))
+
+    mons = ctx.cluster.only(teuthology.is_type('mon'))
+    # note down the first cluster_name and mon_id
+    # we will recover it later on
+    cluster_name, _, mon_id = teuthology.split_role(first_mon)
+    _nuke_mons(manager, mons, mon_id)
+    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
+        cluster=cluster_name)
+    keyring_path = config.get('keyring_path', default_keyring)
+    _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
+    _revive_mons(manager, mons, mon_id, keyring_path)
+    _revive_mgrs(ctx, manager)
+    _revive_osds(ctx, manager)
diff --git a/qa/tasks/reg11184.py b/qa/tasks/reg11184.py
new file mode 100644
index 000000000..86cfbf39a
--- /dev/null
+++ b/qa/tasks/reg11184.py
@@ -0,0 +1,242 @@
+"""
+Special regression test for tracker #11184
+
+Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))
+
+This is accomplished by moving a pg that wasn't part of split and still include
+divergent priors.
+"""
+import logging
+import time
+
+from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from tasks.util.rados import rados
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Test handling of divergent entries during export / import
+    to regression test tracker #11184
+
+    overrides:
+      ceph:
+        conf:
+          osd:
+            debug osd: 5
+
+    Requires 3 osds on a single test node.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'divergent_priors task only accepts a dict for configuration'
+
+    manager = ctx.managers['ceph']
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    osds = [0, 1, 2]
+    manager.flush_pg_stats(osds)
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'noin')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+    dummyfile2 = '/etc/resolv.conf'
+    testdir = teuthology.get_testdir(ctx)
+
+    # create 1 pg pool
+    log.info('creating foo')
+    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+    manager.raw_cluster_cmd(
+        'osd', 'pool', 'application', 'enable',
+        'foo', 'rados', run.Raw('||'), 'true')
+
+    # Remove extra pool to simlify log output
+    manager.raw_cluster_cmd('osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it')
+
+    for i in osds:
+        manager.set_config(i, osd_min_pg_log_entries=10)
+        manager.set_config(i, osd_max_pg_log_entries=10)
+        manager.set_config(i, osd_pg_log_trim_min=5)
+
+    # determine primary
+    divergent = manager.get_pg_primary('foo', 0)
+    log.info("primary and soon to be divergent is %d", divergent)
+    non_divergent = list(osds)
+    non_divergent.remove(divergent)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+    # write 100 objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+    manager.wait_for_clean()
+
+    # blackhole non_divergent
+    log.info("blackholing osds %s", str(non_divergent))
+    for i in non_divergent:
+        manager.set_config(i, objectstore_blackhole=1)
+
+    DIVERGENT_WRITE = 5
+    DIVERGENT_REMOVE = 5
+    # Write some soon to be divergent
+    log.info('writing divergent objects')
+    for i in range(DIVERGENT_WRITE):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+                         dummyfile2], wait=False)
+    # Remove some soon to be divergent
+    log.info('remove divergent objects')
+    for i in range(DIVERGENT_REMOVE):
+        rados(ctx, mon, ['-p', 'foo', 'rm',
+                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+    time.sleep(10)
+    mon.run(
+        args=['killall', '-9', 'rados'],
+        wait=True,
+        check_status=False)
+
+    # kill all the osds but leave divergent in
+    log.info('killing all the osds')
+    for i in osds:
+        manager.kill_osd(i)
+    for i in osds:
+        manager.mark_down_osd(i)
+    for i in non_divergent:
+        manager.mark_out_osd(i)
+
+    # bring up non-divergent
+    log.info("bringing up non_divergent %s", str(non_divergent))
+    for i in non_divergent:
+        manager.revive_osd(i)
+    for i in non_divergent:
+        manager.mark_in_osd(i)
+
+    # write 1 non-divergent object (ensure that old divergent one is divergent)
+    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+    log.info('writing non-divergent object ' + objname)
+    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+    manager.wait_for_recovery()
+
+    # ensure no recovery of up osds first
+    log.info('delay recovery')
+    for i in non_divergent:
+        manager.wait_run_admin_socket(
+            'osd', i, ['set_recovery_delay', '100000'])
+
+    # bring in our divergent friend
+    log.info("revive divergent %d", divergent)
+    manager.raw_cluster_cmd('osd', 'set', 'noup')
+    manager.revive_osd(divergent)
+
+    log.info('delay recovery divergent')
+    manager.wait_run_admin_socket(
+        'osd', divergent, ['set_recovery_delay', '100000'])
+
+    manager.raw_cluster_cmd('osd', 'unset', 'noup')
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    log.info('wait for peering')
+    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+    # At this point the divergent_priors should have been detected
+
+    log.info("killing divergent %d", divergent)
+    manager.kill_osd(divergent)
+
+    # Split pgs for pool foo
+    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')
+    time.sleep(5)
+
+    manager.raw_cluster_cmd('pg','dump')
+
+    # Export a pg
+    (exp_remote,) = ctx.\
+        cluster.only('osd.{o}'.format(o=divergent)).remotes.keys()
+    FSPATH = manager.get_filepath()
+    JPATH = os.path.join(FSPATH, "journal")
+    prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+              "--data-path {fpath} --journal-path {jpath} "
+              "--log-file="
+              "/var/log/ceph/objectstore_tool.$$.log ".
+              format(fpath=FSPATH, jpath=JPATH))
+    pid = os.getpid()
+    expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
+    cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
+           format(id=divergent, file=expfile))
+    try:
+        exp_remote.sh(cmd, wait=True)
+    except CommandFailedError as e:
+        assert e.exitstatus == 0
+
+    # Kill one of non-divergent OSDs
+    log.info('killing osd.%d' % non_divergent[0])
+    manager.kill_osd(non_divergent[0])
+    manager.mark_down_osd(non_divergent[0])
+    # manager.mark_out_osd(non_divergent[0])
+
+    # An empty collection for pg 2.0 might need to be cleaned up
+    cmd = ((prefix + "--force --op remove --pgid 2.0").
+           format(id=non_divergent[0]))
+    exp_remote.sh(cmd, wait=True, check_status=False)
+
+    cmd = ((prefix + "--op import --file {file}").
+           format(id=non_divergent[0], file=expfile))
+    try:
+        exp_remote.sh(cmd, wait=True)
+    except CommandFailedError as e:
+        assert e.exitstatus == 0
+
+    # bring in our divergent friend and other node
+    log.info("revive divergent %d", divergent)
+    manager.revive_osd(divergent)
+    manager.mark_in_osd(divergent)
+    log.info("revive %d", non_divergent[0])
+    manager.revive_osd(non_divergent[0])
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    log.info('delay recovery divergent')
+    manager.set_config(divergent, osd_recovery_delay_start=100000)
+    log.info('mark divergent in')
+    manager.mark_in_osd(divergent)
+
+    log.info('wait for peering')
+    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+    log.info("killing divergent %d", divergent)
+    manager.kill_osd(divergent)
+    log.info("reviving divergent %d", divergent)
+    manager.revive_osd(divergent)
+    time.sleep(3)
+
+    log.info('allowing recovery')
+    # Set osd_recovery_delay_start back to 0 and kick the queue
+    for i in osds:
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+                                'kick_recovery_wq', ' 0')
+
+    log.info('reading divergent objects')
+    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+                                       '/tmp/existing'])
+        assert exit_status == 0
+
+    (remote,) = ctx.\
+        cluster.only('osd.{o}'.format(o=divergent)).remotes.keys()
+    cmd = 'rm {file}'.format(file=expfile)
+    remote.run(args=cmd, wait=True)
+    log.info("success")
diff --git a/qa/tasks/rep_lost_unfound_delete.py b/qa/tasks/rep_lost_unfound_delete.py
new file mode 100644
index 000000000..8e99ade27
--- /dev/null
+++ b/qa/tasks/rep_lost_unfound_delete.py
@@ -0,0 +1,179 @@
+"""
+Lost_unfound
+"""
+import logging
+import time
+
+from tasks import ceph_manager
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of lost objects.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    POOL = 'unfounddel_pool'
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+    manager.create_pool(POOL)
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+
+    # take an osd out until the very end
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+    manager.mark_out_osd(2)
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.1',
+            'injectargs',
+            '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+            )
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+    
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+
+    # bring osd.0 back up, let it peer, but don't replicate the new
+    # objects...
+    log.info('osd.0 command_args is %s' % 'foo')
+    log.info(ctx.daemons.get_daemon('osd', 0).command_args)
+    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
+            '--osd-recovery-delay-start', '1000'
+            ])
+    manager.revive_osd(0)
+    manager.mark_in_osd(0)
+    manager.wait_till_osd_is_up(0)
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_till_active()
+
+    # take out osd.1 and the only copy of those objects.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.mark_out_osd(1)
+    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+
+    # bring up osd.2 so that things would otherwise, in theory, recovery fully
+    manager.revive_osd(2)
+    manager.mark_in_osd(2)
+    manager.wait_till_osd_is_up(2)
+
+    manager.flush_pg_stats([0, 2])
+    manager.wait_till_active()
+    manager.flush_pg_stats([0, 2])
+
+    # verify that there are unfound objects
+    unfound = manager.get_num_unfound_objects()
+    log.info("there are %d unfound objects" % unfound)
+    assert unfound
+
+    testdir = teuthology.get_testdir(ctx)
+    procs = []
+    if config.get('parallel_bench', True):
+        procs.append(mon.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'rados',
+                          '--no-log-to-stderr',
+                          '--name', 'client.admin',
+                          '-b', str(4<<10),
+                          '-p' , POOL,
+                          '-t', '20',
+                          'bench', '240', 'write',
+                      ]).format(tdir=testdir),
+            ],
+            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+            stdin=run.PIPE,
+            wait=False
+        ))
+    time.sleep(10)
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        if pg['stat_sum']['num_objects_unfound'] > 0:
+            primary = 'osd.%d' % pg['acting'][0]
+
+            # verify that i can list them direct from the osd
+            log.info('listing missing/lost in %s state %s', pg['pgid'],
+                     pg['state']);
+            m = manager.list_pg_unfound(pg['pgid'])
+            #log.info('%s' % m)
+            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+            num_unfound=0
+            for o in m['objects']:
+                if len(o['locations']) == 0:
+                    num_unfound += 1
+            assert m['num_unfound'] == num_unfound
+
+            log.info("reverting unfound in %s on %s", pg['pgid'], primary)
+            manager.raw_cluster_cmd('pg', pg['pgid'],
+                                    'mark_unfound_lost', 'delete')
+        else:
+            log.info("no unfound in %s", pg['pgid'])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+    manager.flush_pg_stats([0, 2])
+    manager.wait_for_recovery()
+
+    # verify result
+    for f in range(1, 10):
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
+        assert err
+
+    # see if osd.1 can cope
+    manager.mark_in_osd(1)
+    manager.revive_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.wait_for_clean()
+    run.wait(procs)
+    manager.wait_for_clean()
+
diff --git a/qa/tasks/repair_test.py b/qa/tasks/repair_test.py
new file mode 100644
index 000000000..cfd6ef791
--- /dev/null
+++ b/qa/tasks/repair_test.py
@@ -0,0 +1,303 @@
+"""
+Test pool repairing after objects are damaged.
+"""
+import logging
+import time
+
+log = logging.getLogger(__name__)
+
+
+def choose_primary(manager, pool, num):
+    """
+    Return primary to test on.
+    """
+    log.info("Choosing primary")
+    return manager.get_pg_primary(pool, num)
+
+
+def choose_replica(manager, pool, num):
+    """
+    Return replica to test on.
+    """
+    log.info("Choosing replica")
+    return manager.get_pg_replica(pool, num)
+
+
+def trunc(manager, osd, pool, obj):
+    """
+    truncate an object
+    """
+    log.info("truncating object")
+    return manager.osd_admin_socket(
+        osd,
+        ['truncobj', pool, obj, '1'])
+
+
+def dataerr(manager, osd, pool, obj):
+    """
+    cause an error in the data
+    """
+    log.info("injecting data err on object")
+    return manager.osd_admin_socket(
+        osd,
+        ['injectdataerr', pool, obj])
+
+
+def mdataerr(manager, osd, pool, obj):
+    """
+    cause an error in the mdata
+    """
+    log.info("injecting mdata err on object")
+    return manager.osd_admin_socket(
+        osd,
+        ['injectmdataerr', pool, obj])
+
+
+def omaperr(manager, osd, pool, obj):
+    """
+    Cause an omap error.
+    """
+    log.info("injecting omap err on object")
+    return manager.osd_admin_socket(osd, ['setomapval', pool, obj,
+                                              'badkey', 'badval'])
+
+
+def repair_test_1(manager, corrupter, chooser, scrub_type):
+    """
+    Creates an object in the pool, corrupts it,
+    scrubs it, and verifies that the pool is inconsistent.  It then repairs
+    the pool, rescrubs it, and verifies that the pool is consistent
+
+    :param corrupter: error generating function (truncate, data-error, or
+     meta-data error, for example).
+    :param chooser: osd type chooser (primary or replica)
+    :param scrub_type: regular scrub or deep-scrub
+    """
+    pool = "repair_pool_1"
+    manager.wait_for_clean()
+    with manager.pool(pool, 1):
+
+        log.info("starting repair test type 1")
+        victim_osd = chooser(manager, pool, 0)
+
+        # create object
+        log.info("doing put")
+        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
+
+        # corrupt object
+        log.info("corrupting object")
+        corrupter(manager, victim_osd, pool, 'repair_test_obj')
+
+        # verify inconsistent
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, scrub_type)
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # repair
+        log.info("repairing")
+        manager.do_pg_scrub(pool, 0, "repair")
+
+        log.info("re-scrubbing")
+        manager.do_pg_scrub(pool, 0, scrub_type)
+
+        # verify consistent
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
+        log.info("done")
+
+
+def repair_test_2(ctx, manager, config, chooser):
+    """
+    First creates a set of objects and
+    sets the omap value.  It then corrupts an object, does both a scrub
+    and a deep-scrub, and then corrupts more objects.  After that, it
+    repairs the pool and makes sure that the pool is consistent some
+    time after a deep-scrub.
+
+    :param chooser: primary or replica selection routine.
+    """
+    pool = "repair_pool_2"
+    manager.wait_for_clean()
+    with manager.pool(pool, 1):
+        log.info("starting repair test type 2")
+        victim_osd = chooser(manager, pool, 0)
+
+        # create object
+        log.info("doing put and setomapval")
+        manager.do_put(pool, 'file1', '/etc/hosts')
+        manager.do_rados(['setomapval', 'file1', 'key', 'val'], pool=pool)
+        manager.do_put(pool, 'file2', '/etc/hosts')
+        manager.do_put(pool, 'file3', '/etc/hosts')
+        manager.do_put(pool, 'file4', '/etc/hosts')
+        manager.do_put(pool, 'file5', '/etc/hosts')
+        manager.do_rados(['setomapval', 'file5', 'key', 'val'], pool=pool)
+        manager.do_put(pool, 'file6', '/etc/hosts')
+
+        # corrupt object
+        log.info("corrupting object")
+        omaperr(manager, victim_osd, pool, 'file1')
+
+        # verify inconsistent
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # Regression test for bug #4778, should still
+        # be inconsistent after scrub
+        manager.do_pg_scrub(pool, 0, 'scrub')
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # Additional corruptions including 2 types for file1
+        log.info("corrupting more objects")
+        dataerr(manager, victim_osd, pool, 'file1')
+        mdataerr(manager, victim_osd, pool, 'file2')
+        trunc(manager, victim_osd, pool, 'file3')
+        omaperr(manager, victim_osd, pool, 'file6')
+
+        # see still inconsistent
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # repair
+        log.info("repairing")
+        manager.do_pg_scrub(pool, 0, "repair")
+
+        # Let repair clear inconsistent flag
+        time.sleep(10)
+
+        # verify consistent
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
+
+        # In the future repair might determine state of
+        # inconsistency itself, verify with a deep-scrub
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+        # verify consistent
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
+
+        log.info("done")
+
+
+def hinfoerr(manager, victim, pool, obj):
+    """
+    cause an error in the hinfo_key
+    """
+    log.info("remove the hinfo_key")
+    manager.objectstore_tool(pool,
+                             options='',
+                             args='rm-attr hinfo_key',
+                             object_name=obj,
+                             osd=victim)
+
+
+def repair_test_erasure_code(manager, corrupter, victim, scrub_type):
+    """
+    Creates an object in the pool, corrupts it,
+    scrubs it, and verifies that the pool is inconsistent.  It then repairs
+    the pool, rescrubs it, and verifies that the pool is consistent
+
+    :param corrupter: error generating function.
+    :param chooser: osd type chooser (primary or replica)
+    :param scrub_type: regular scrub or deep-scrub
+    """
+    pool = "repair_pool_3"
+    manager.wait_for_clean()
+    with manager.pool(pool_name=pool, pg_num=1,
+                          erasure_code_profile_name='default'):
+
+        log.info("starting repair test for erasure code")
+
+        # create object
+        log.info("doing put")
+        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
+
+        # corrupt object
+        log.info("corrupting object")
+        corrupter(manager, victim, pool, 'repair_test_obj')
+
+        # verify inconsistent
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, scrub_type)
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # repair
+        log.info("repairing")
+        manager.do_pg_scrub(pool, 0, "repair")
+
+        log.info("re-scrubbing")
+        manager.do_pg_scrub(pool, 0, scrub_type)
+
+        # verify consistent
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
+        log.info("done")
+
+
+def task(ctx, config):
+    """
+    Test [deep] repair in several situations:
+      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]
+
+    The config should be as follows:
+
+      Must include the log-ignorelist below
+      Must enable filestore_debug_inject_read_err config
+
+    example:
+
+    tasks:
+    - chef:
+    - install:
+    - ceph:
+        log-ignorelist:
+          - 'candidate had a stat error'
+          - 'candidate had a read error'
+          - 'deep-scrub 0 missing, 1 inconsistent objects'
+          - 'deep-scrub 0 missing, 4 inconsistent objects'
+          - 'deep-scrub [0-9]+ errors'
+          - '!= omap_digest'
+          - '!= data_digest'
+          - 'repair 0 missing, 1 inconsistent objects'
+          - 'repair 0 missing, 4 inconsistent objects'
+          - 'repair [0-9]+ errors, [0-9]+ fixed'
+          - 'scrub 0 missing, 1 inconsistent objects'
+          - 'scrub [0-9]+ errors'
+          - 'size 1 != size'
+          - 'attr name mismatch'
+          - 'Regular scrub request, deep-scrub details will be lost'
+          - 'candidate size [0-9]+ info size [0-9]+ mismatch'
+        conf:
+          osd:
+            filestore debug inject read err: true
+    - repair_test:
+
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'repair_test task only accepts a dict for config'
+
+    manager = ctx.managers['ceph']
+    manager.wait_for_all_osds_up()
+
+    manager.raw_cluster_cmd('osd', 'set', 'noscrub')
+    manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
+
+    repair_test_1(manager, mdataerr, choose_primary, "scrub")
+    repair_test_1(manager, mdataerr, choose_replica, "scrub")
+    repair_test_1(manager, dataerr, choose_primary, "deep-scrub")
+    repair_test_1(manager, dataerr, choose_replica, "deep-scrub")
+    repair_test_1(manager, trunc, choose_primary, "scrub")
+    repair_test_1(manager, trunc, choose_replica, "scrub")
+    repair_test_2(ctx, manager, config, choose_primary)
+    repair_test_2(ctx, manager, config, choose_replica)
+
+    repair_test_erasure_code(manager, hinfoerr, 'primary', "deep-scrub")
+
+    manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
+    manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
diff --git a/qa/tasks/resolve_stuck_peering.py b/qa/tasks/resolve_stuck_peering.py
new file mode 100644
index 000000000..d140544c4
--- /dev/null
+++ b/qa/tasks/resolve_stuck_peering.py
@@ -0,0 +1,112 @@
+"""
+Resolve stuck peering
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+from tasks.util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling resolve stuck peering
+
+    requires 3 osds on a single test node
+    """
+    if config is None:
+        config = {}
+        assert isinstance(config, dict), \
+            'Resolve stuck peering only accepts a dict for config'
+
+    manager = ctx.managers['ceph']
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+
+    manager.wait_for_clean()
+
+    dummyfile = '/etc/fstab'
+    dummyfile1 = '/etc/resolv.conf'
+
+    #create 1 PG pool
+    pool='foo'
+    log.info('creating pool foo')
+    manager.raw_cluster_cmd('osd', 'pool', 'create', '%s' % pool, '1')
+
+    #set min_size of the pool to 1
+    #so that we can continue with I/O
+    #when 2 osds are down
+    manager.set_pool_property(pool, "min_size", 1)
+
+    osds = [0, 1, 2]
+
+    primary = manager.get_pg_primary('foo', 0)
+    log.info("primary osd is %d", primary)
+
+    others = list(osds)
+    others.remove(primary)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+    #create few objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+    manager.wait_for_clean()
+
+    #kill other osds except primary
+    log.info('killing other osds except primary')
+    for i in others:
+        manager.kill_osd(i)
+    for i in others:
+        manager.mark_down_osd(i)
+
+
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'new_%d' % i, dummyfile1])
+
+    #kill primary osd
+    manager.kill_osd(primary)
+    manager.mark_down_osd(primary)
+
+    #revive other 2 osds
+    for i in others:
+        manager.revive_osd(i)
+
+    #make sure that pg is down
+    #Assuming pg number for single pg pool will start from 0
+    pgnum=0
+    pgstr = manager.get_pgid(pool, pgnum)
+    stats = manager.get_single_pg_stats(pgstr)
+    print(stats['state'])
+
+    timeout=60
+    start=time.time()
+
+    while 'down' not in stats['state']:
+        assert time.time() - start < timeout, \
+            'failed to reach down state before timeout expired'
+        stats = manager.get_single_pg_stats(pgstr)
+
+    #mark primary as lost
+    manager.raw_cluster_cmd('osd', 'lost', '%d' % primary,\
+                            '--yes-i-really-mean-it')
+
+
+    #expect the pg status to be active+undersized+degraded
+    #pg should recover and become active+clean within timeout
+    stats = manager.get_single_pg_stats(pgstr)
+    print(stats['state'])
+
+    timeout=10
+    start=time.time()
+
+    while manager.get_num_down():
+        assert time.time() - start < timeout, \
+            'failed to recover before timeout expired'
+
+    manager.revive_osd(primary)
diff --git a/qa/tasks/restart.py b/qa/tasks/restart.py
new file mode 100644
index 000000000..4053bd2cb
--- /dev/null
+++ b/qa/tasks/restart.py
@@ -0,0 +1,161 @@
+"""
+Daemon restart
+"""
+import logging
+import pipes
+import os
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run as tor
+
+from teuthology.orchestra import run
+log = logging.getLogger(__name__)
+
+def restart_daemon(ctx, config, role, id_, *args):
+    """
+    Handle restart (including the execution of the command parameters passed)
+    """
+    log.info('Restarting {r}.{i} daemon...'.format(r=role, i=id_))
+    daemon = ctx.daemons.get_daemon(role, id_)
+    log.debug('Waiting for exit of {r}.{i} daemon...'.format(r=role, i=id_))
+    try:
+        daemon.wait_for_exit()
+    except tor.CommandFailedError as e:
+        log.debug('Command Failed: {e}'.format(e=e))
+    if len(args) > 0:
+        confargs = ['--{k}={v}'.format(k=k, v=v) for k,v in zip(args[0::2], args[1::2])]
+        log.debug('Doing restart of {r}.{i} daemon with args: {a}...'.format(r=role, i=id_, a=confargs))
+        daemon.restart_with_args(confargs)
+    else:
+        log.debug('Doing restart of {r}.{i} daemon...'.format(r=role, i=id_))
+        daemon.restart()
+
+def get_tests(ctx, config, role, remote, testdir):
+    """Download restart tests"""
+    srcdir = '{tdir}/restart.{role}'.format(tdir=testdir, role=role)
+
+    refspec = config.get('branch')
+    if refspec is None:
+        refspec = config.get('sha1')
+    if refspec is None:
+        refspec = config.get('tag')
+    if refspec is None:
+        refspec = 'HEAD'
+    log.info('Pulling restart qa/workunits from ref %s', refspec)
+
+    remote.run(
+        logger=log.getChild(role),
+        args=[
+            'mkdir', '--', srcdir,
+            run.Raw('&&'),
+            'git',
+            'clone',
+            'https://git.ceph.com/ceph.git',
+            srcdir,
+            run.Raw('&&'),
+            'cd', '--', srcdir,
+            run.Raw('&&'),
+            'git', 'checkout', '-b', 'restart_test', str(refspec),
+            run.Raw('&&'),
+            'cd', '--', 'qa/workunits',
+            run.Raw('&&'),
+            'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
+            run.Raw('&&'),
+            'find', '-executable', '-type', 'f', '-printf', r'%P\0',
+            run.Raw('>{tdir}/restarts.list'.format(tdir=testdir)),
+            ],
+        )
+    restarts = sorted(remote.read_file(f'{testdir}/restarts.list').decode().split('\0'))
+    return (os.path.join(srcdir, 'qa/workunits'), restarts)
+
+def task(ctx, config):
+    """
+    Execute commands and allow daemon restart with config options.
+    Each process executed can output to stdout restart commands of the form:
+        restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
+    This will restart the daemon <role>.<id> with the specified config values once
+    by modifying the conf file with those values, and then replacing the old conf file
+    once the daemon is restarted.
+    This task does not kill a running daemon, it assumes the daemon will abort on an
+    assert specified in the config.
+
+        tasks:
+        - install:
+        - ceph:
+        - restart:
+            exec:
+              client.0:
+                - test_backtraces.py
+
+    """
+    assert isinstance(config, dict), "task kill got invalid config"
+
+    testdir = teuthology.get_testdir(ctx)
+
+    try:
+        assert 'exec' in config, "config requires exec key with <role>: <command> entries"
+        for role, task in config['exec'].items():
+            log.info('restart for role {r}'.format(r=role))
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            srcdir, restarts = get_tests(ctx, config, role, remote, testdir)
+            log.info('Running command on role %s host %s', role, remote.name)
+            spec = '{spec}'.format(spec=task[0])
+            log.info('Restarts list: %s', restarts)
+            log.info('Spec is %s', spec)
+            to_run = [w for w in restarts if w == task or w.find(spec) != -1]
+            log.info('To run: %s', to_run)
+            for c in to_run:
+                log.info('Running restart script %s...', c)
+                args = [
+                    run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
+                    ]
+                env = config.get('env')
+                if env is not None:
+                    for var, val in env.items():
+                        quoted_val = pipes.quote(val)
+                        env_arg = '{var}={val}'.format(var=var, val=quoted_val)
+                        args.append(run.Raw(env_arg))
+                args.extend([
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            '{srcdir}/{c}'.format(
+                                srcdir=srcdir,
+                                c=c,
+                                ),
+                            ])
+                proc = remote.run(
+                    args=args,
+                    stdout=tor.PIPE,
+                    stdin=tor.PIPE,
+                    stderr=log,
+                    wait=False,
+                    )
+                log.info('waiting for a command from script')
+                while True:
+                    l = proc.stdout.readline()
+                    if not l or l == '':
+                        break
+                    log.debug('script command: {c}'.format(c=l))
+                    ll = l.strip()
+                    cmd = ll.split(' ')
+                    if cmd[0] == "done":
+                        break
+                    assert cmd[0] == 'restart', "script sent invalid command request to kill task"
+                    # cmd should be: restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
+                    # or to clear, just: restart <role> <id>
+                    restart_daemon(ctx, config, cmd[1], cmd[2], *cmd[3:])
+                    proc.stdin.writelines(['restarted\n'])
+                    proc.stdin.flush()
+                try:
+                    proc.wait()
+                except tor.CommandFailedError:
+                    raise Exception('restart task got non-zero exit status from script: {s}'.format(s=c))
+    finally:
+        log.info('Finishing %s on %s...', task, role)
+        remote.run(
+            logger=log.getChild(role),
+            args=[
+                'rm', '-rf', '--', '{tdir}/restarts.list'.format(tdir=testdir), srcdir,
+                ],
+            )
diff --git a/qa/tasks/rgw.py b/qa/tasks/rgw.py
new file mode 100644
index 000000000..36627f682
--- /dev/null
+++ b/qa/tasks/rgw.py
@@ -0,0 +1,431 @@
+"""
+rgw routines
+"""
+import argparse
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.exceptions import ConfigError
+from tasks.ceph_manager import get_valgrind_args
+from tasks.util import get_remote_for_role
+from tasks.util.rgw import rgwadmin, wait_for_radosgw
+from tasks.util.rados import (create_ec_pool,
+                              create_replicated_pool,
+                              create_cache_pool)
+
+log = logging.getLogger(__name__)
+
+class RGWEndpoint:
+    def __init__(self, hostname=None, port=None, cert=None, dns_name=None, website_dns_name=None):
+        self.hostname = hostname
+        self.port = port
+        self.cert = cert
+        self.dns_name = dns_name
+        self.website_dns_name = website_dns_name
+
+    def url(self):
+        proto = 'https' if self.cert else 'http'
+        return '{proto}://{hostname}:{port}/'.format(proto=proto, hostname=self.hostname, port=self.port)
+
+@contextlib.contextmanager
+def start_rgw(ctx, config, clients):
+    """
+    Start rgw on remote sites.
+    """
+    log.info('Starting rgw...')
+    testdir = teuthology.get_testdir(ctx)
+    for client in clients:
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        cluster_name, daemon_type, client_id = teuthology.split_role(client)
+        client_with_id = daemon_type + '.' + client_id
+        client_with_cluster = cluster_name + '.' + client_with_id
+
+        client_config = config.get(client)
+        if client_config is None:
+            client_config = {}
+        log.info("rgw %s config is %s", client, client_config)
+        cmd_prefix = [
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            'daemon-helper',
+            'term',
+            ]
+
+        rgw_cmd = ['radosgw']
+
+        log.info("Using %s as radosgw frontend", ctx.rgw.frontend)
+
+        endpoint = ctx.rgw.role_endpoints[client]
+        frontends = ctx.rgw.frontend
+        frontend_prefix = client_config.get('frontend_prefix', None)
+        if frontend_prefix:
+            frontends += ' prefix={pfx}'.format(pfx=frontend_prefix)
+
+        if endpoint.cert:
+            # add the ssl certificate path
+            frontends += ' ssl_certificate={}'.format(endpoint.cert.certificate)
+            if ctx.rgw.frontend == 'civetweb':
+                frontends += ' port={}s'.format(endpoint.port)
+            else:
+                frontends += ' ssl_port={}'.format(endpoint.port)
+        else:
+            frontends += ' port={}'.format(endpoint.port)
+
+        rgw_cmd.extend([
+            '--rgw-frontends', frontends,
+            '-n', client_with_id,
+            '--cluster', cluster_name,
+            '-k', '/etc/ceph/{client_with_cluster}.keyring'.format(client_with_cluster=client_with_cluster),
+            '--log-file',
+            '/var/log/ceph/rgw.{client_with_cluster}.log'.format(client_with_cluster=client_with_cluster),
+            '--rgw_ops_log_socket_path',
+            '{tdir}/rgw.opslog.{client_with_cluster}.sock'.format(tdir=testdir,
+                                                     client_with_cluster=client_with_cluster),
+	    ])
+
+        keystone_role = client_config.get('use-keystone-role', None)
+        if keystone_role is not None:
+            if not ctx.keystone:
+                raise ConfigError('rgw must run after the keystone task')
+            url = 'http://{host}:{port}/v1/KEY_$(tenant_id)s'.format(host=endpoint.hostname,
+                                                                     port=endpoint.port)
+            ctx.keystone.create_endpoint(ctx, keystone_role, 'swift', url)
+
+            keystone_host, keystone_port = \
+                ctx.keystone.public_endpoints[keystone_role]
+            rgw_cmd.extend([
+                '--rgw_keystone_url',
+                'http://{khost}:{kport}'.format(khost=keystone_host,
+                                                kport=keystone_port),
+                ])
+
+
+        if client_config.get('dns-name') is not None:
+            rgw_cmd.extend(['--rgw-dns-name', endpoint.dns_name])
+        if client_config.get('dns-s3website-name') is not None:
+            rgw_cmd.extend(['--rgw-dns-s3website-name', endpoint.website_dns_name])
+
+
+        vault_role = client_config.get('use-vault-role', None)
+        barbican_role = client_config.get('use-barbican-role', None)
+        pykmip_role = client_config.get('use-pykmip-role', None)
+
+        token_path = '/etc/ceph/vault-root-token'
+        if barbican_role is not None:
+            if not hasattr(ctx, 'barbican'):
+                raise ConfigError('rgw must run after the barbican task')
+
+            barbican_host, barbican_port = \
+                ctx.barbican.endpoints[barbican_role]
+            log.info("Use barbican url=%s:%s", barbican_host, barbican_port)
+
+            rgw_cmd.extend([
+                '--rgw_barbican_url',
+                'http://{bhost}:{bport}'.format(bhost=barbican_host,
+                                                bport=barbican_port),
+                ])
+        elif vault_role is not None:
+            if not ctx.vault.root_token:
+                raise ConfigError('vault: no "root_token" specified')
+            # create token on file
+            ctx.rgw.vault_role = vault_role
+            ctx.cluster.only(client).run(args=['sudo', 'echo', '-n', ctx.vault.root_token, run.Raw('|'), 'sudo', 'tee', token_path])
+            log.info("Token file content")
+            ctx.cluster.only(client).run(args=['cat', token_path])
+            log.info("Restrict access to token file")
+            ctx.cluster.only(client).run(args=['sudo', 'chmod', '600', token_path])
+            ctx.cluster.only(client).run(args=['sudo', 'chown', 'ceph', token_path])
+
+            rgw_cmd.extend([
+                '--rgw_crypt_vault_addr', "{}:{}".format(*ctx.vault.endpoints[vault_role]),
+                '--rgw_crypt_vault_token_file', token_path
+            ])
+        elif pykmip_role is not None:
+            if not hasattr(ctx, 'pykmip'):
+                raise ConfigError('rgw must run after the pykmip task')
+            ctx.rgw.pykmip_role = pykmip_role
+            rgw_cmd.extend([
+                '--rgw_crypt_kmip_addr', "{}:{}".format(*ctx.pykmip.endpoints[pykmip_role]),
+            ])
+
+            clientcert = ctx.ssl_certificates.get('kmip-client')
+            servercert = ctx.ssl_certificates.get('kmip-server')
+            clientca = ctx.ssl_certificates.get('kmiproot')
+
+            clientkey = clientcert.key
+            clientcert = clientcert.certificate
+            serverkey = servercert.key
+            servercert = servercert.certificate
+            rootkey = clientca.key
+            rootcert = clientca.certificate
+
+            cert_path = '/etc/ceph/'
+            ctx.cluster.only(client).run(args=['sudo', 'cp', clientcert, cert_path])
+            ctx.cluster.only(client).run(args=['sudo', 'cp', clientkey, cert_path])
+            ctx.cluster.only(client).run(args=['sudo', 'cp', servercert, cert_path])
+            ctx.cluster.only(client).run(args=['sudo', 'cp', serverkey, cert_path])
+            ctx.cluster.only(client).run(args=['sudo', 'cp', rootkey, cert_path])
+            ctx.cluster.only(client).run(args=['sudo', 'cp', rootcert, cert_path])
+
+            clientcert = cert_path + 'kmip-client.crt'
+            clientkey = cert_path + 'kmip-client.key'
+            servercert = cert_path + 'kmip-server.crt'
+            serverkey = cert_path + 'kmip-server.key'
+            rootkey = cert_path + 'kmiproot.key'
+            rootcert = cert_path + 'kmiproot.crt'
+
+            ctx.cluster.only(client).run(args=['sudo', 'chmod', '600', clientcert, clientkey, servercert, serverkey, rootkey, rootcert])
+            ctx.cluster.only(client).run(args=['sudo', 'chown', 'ceph', clientcert, clientkey, servercert, serverkey, rootkey, rootcert])
+
+        rgw_cmd.extend([
+            '--foreground',
+            run.Raw('|'),
+            'sudo',
+            'tee',
+            '/var/log/ceph/rgw.{client_with_cluster}.stdout'.format(client_with_cluster=client_with_cluster),
+            run.Raw('2>&1'),
+            ])
+
+        if client_config.get('valgrind'):
+            cmd_prefix = get_valgrind_args(
+                testdir,
+                client_with_cluster,
+                cmd_prefix,
+                client_config.get('valgrind')
+                )
+
+        run_cmd = list(cmd_prefix)
+        run_cmd.extend(rgw_cmd)
+
+        ctx.daemons.add_daemon(
+            remote, 'rgw', client_with_id,
+            cluster=cluster_name,
+            fsid=ctx.ceph[cluster_name].fsid,
+            args=run_cmd,
+            logger=log.getChild(client),
+            stdin=run.PIPE,
+            wait=False,
+            )
+
+    # XXX: add_daemon() doesn't let us wait until radosgw finishes startup
+    for client in clients:
+        endpoint = ctx.rgw.role_endpoints[client]
+        url = endpoint.url()
+        log.info('Polling {client} until it starts accepting connections on {url}'.format(client=client, url=url))
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        wait_for_radosgw(url, remote)
+
+    try:
+        yield
+    finally:
+        for client in clients:
+            cluster_name, daemon_type, client_id = teuthology.split_role(client)
+            client_with_id = daemon_type + '.' + client_id
+            client_with_cluster = cluster_name + '.' + client_with_id
+            ctx.daemons.get_daemon('rgw', client_with_id, cluster_name).stop()
+            ctx.cluster.only(client).run(
+                args=[
+                    'rm',
+                    '-f',
+                    '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir,
+                                                             client=client_with_cluster),
+                    ],
+                )
+            ctx.cluster.only(client).run(args=['sudo', 'rm', '-f', token_path])
+
+def assign_endpoints(ctx, config, default_cert):
+    role_endpoints = {}
+    for role, client_config in config.items():
+        client_config = client_config or {}
+        remote = get_remote_for_role(ctx, role)
+
+        cert = client_config.get('ssl certificate', default_cert)
+        if cert:
+            # find the certificate created by the ssl task
+            if not hasattr(ctx, 'ssl_certificates'):
+                raise ConfigError('rgw: no ssl task found for option "ssl certificate"')
+            ssl_certificate = ctx.ssl_certificates.get(cert, None)
+            if not ssl_certificate:
+                raise ConfigError('rgw: missing ssl certificate "{}"'.format(cert))
+        else:
+            ssl_certificate = None
+
+        port = client_config.get('port', 443 if ssl_certificate else 80)
+
+        # if dns-name is given, use it as the hostname (or as a prefix)
+        dns_name = client_config.get('dns-name', '')
+        if len(dns_name) == 0 or dns_name.endswith('.'):
+            dns_name += remote.hostname
+
+        website_dns_name = client_config.get('dns-s3website-name')
+        if website_dns_name is not None and (len(website_dns_name) == 0 or website_dns_name.endswith('.')):
+            website_dns_name += remote.hostname
+
+        role_endpoints[role] = RGWEndpoint(remote.hostname, port, ssl_certificate, dns_name, website_dns_name)
+
+    return role_endpoints
+
+@contextlib.contextmanager
+def create_pools(ctx, clients):
+    """Create replicated or erasure coded data pools for rgw."""
+
+    log.info('Creating data pools')
+    for client in clients:
+        log.debug("Obtaining remote for client {}".format(client))
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        data_pool = 'default.rgw.buckets.data'
+        cluster_name, daemon_type, client_id = teuthology.split_role(client)
+
+        if ctx.rgw.ec_data_pool:
+            create_ec_pool(remote, data_pool, client, ctx.rgw.data_pool_pg_size,
+                           ctx.rgw.erasure_code_profile, cluster_name, 'rgw')
+        else:
+            create_replicated_pool(remote, data_pool, ctx.rgw.data_pool_pg_size, cluster_name, 'rgw')
+
+        index_pool = 'default.rgw.buckets.index'
+        create_replicated_pool(remote, index_pool, ctx.rgw.index_pool_pg_size, cluster_name, 'rgw')
+
+        if ctx.rgw.cache_pools:
+            create_cache_pool(remote, data_pool, data_pool + '.cache', 64,
+                              64*1024*1024, cluster_name)
+    log.debug('Pools created')
+    yield
+
+@contextlib.contextmanager
+def configure_compression(ctx, clients, compression):
+    """ set a compression type in the default zone placement """
+    log.info('Configuring compression type = %s', compression)
+    for client in clients:
+        # XXX: the 'default' zone and zonegroup aren't created until we run RGWRados::init_complete().
+        # issue a 'radosgw-admin user list' command to trigger this
+        rgwadmin(ctx, client, cmd=['user', 'list'], check_status=True)
+
+        rgwadmin(ctx, client,
+                cmd=['zone', 'placement', 'modify', '--rgw-zone', 'default',
+                     '--placement-id', 'default-placement',
+                     '--compression', compression],
+                check_status=True)
+    yield
+
+@contextlib.contextmanager
+def configure_storage_classes(ctx, clients, storage_classes):
+    """ set a compression type in the default zone placement """
+
+    sc = [s.strip() for s in storage_classes.split(',')]
+
+    for client in clients:
+        # XXX: the 'default' zone and zonegroup aren't created until we run RGWRados::init_complete().
+        # issue a 'radosgw-admin user list' command to trigger this
+        rgwadmin(ctx, client, cmd=['user', 'list'], check_status=True)
+
+        for storage_class in sc:
+            log.info('Configuring storage class type = %s', storage_class)
+            rgwadmin(ctx, client,
+                    cmd=['zonegroup', 'placement', 'add',
+                        '--rgw-zone', 'default',
+                        '--placement-id', 'default-placement',
+                        '--storage-class', storage_class],
+                    check_status=True)
+            rgwadmin(ctx, client,
+                    cmd=['zone', 'placement', 'add',
+                        '--rgw-zone', 'default',
+                        '--placement-id', 'default-placement',
+                        '--storage-class', storage_class,
+                        '--data-pool', 'default.rgw.buckets.data.' + storage_class.lower()],
+                    check_status=True)
+    yield
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    For example, to run rgw on all clients::
+
+        tasks:
+        - ceph:
+        - rgw:
+
+    To only run on certain clients::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0, client.3]
+
+    or
+
+        tasks:
+        - ceph:
+        - rgw:
+            client.0:
+            client.3:
+
+    To run radosgw through valgrind:
+
+        tasks:
+        - ceph:
+        - rgw:
+            client.0:
+              valgrind: [--tool=memcheck]
+            client.3:
+              valgrind: [--tool=memcheck]
+
+    To configure data or index pool pg_size:
+
+        overrides:
+          rgw:
+            data_pool_pg_size: 256
+            index_pool_pg_size: 128
+    """
+    if config is None:
+        config = dict(('client.{id}'.format(id=id_), None)
+                      for id_ in teuthology.all_roles_of_type(
+                          ctx.cluster, 'client'))
+    elif isinstance(config, list):
+        config = dict((name, None) for name in config)
+
+    clients = config.keys() # http://tracker.ceph.com/issues/20417
+
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('rgw', {}))
+
+    ctx.rgw = argparse.Namespace()
+
+    ctx.rgw.ec_data_pool = bool(config.pop('ec-data-pool', False))
+    ctx.rgw.erasure_code_profile = config.pop('erasure_code_profile', {})
+    ctx.rgw.cache_pools = bool(config.pop('cache-pools', False))
+    ctx.rgw.frontend = config.pop('frontend', 'civetweb')
+    ctx.rgw.compression_type = config.pop('compression type', None)
+    ctx.rgw.storage_classes = config.pop('storage classes', None)
+    default_cert = config.pop('ssl certificate', None)
+    ctx.rgw.data_pool_pg_size = config.pop('data_pool_pg_size', 64)
+    ctx.rgw.index_pool_pg_size = config.pop('index_pool_pg_size', 64)
+    ctx.rgw.config = config
+
+    log.debug("config is {}".format(config))
+    log.debug("client list is {}".format(clients))
+
+    ctx.rgw.role_endpoints = assign_endpoints(ctx, config, default_cert)
+
+    subtasks = [
+        lambda: create_pools(ctx=ctx, clients=clients),
+    ]
+    if ctx.rgw.compression_type:
+        subtasks.extend([
+            lambda: configure_compression(ctx=ctx, clients=clients,
+                                          compression=ctx.rgw.compression_type),
+        ])
+    if ctx.rgw.storage_classes:
+        subtasks.extend([
+            lambda: configure_storage_classes(ctx=ctx, clients=clients,
+                                              storage_classes=ctx.rgw.storage_classes),
+        ])
+    subtasks.extend([
+        lambda: start_rgw(ctx=ctx, config=config, clients=clients),
+    ])
+
+    with contextutil.nested(*subtasks):
+        yield
diff --git a/qa/tasks/rgw_logsocket.py b/qa/tasks/rgw_logsocket.py
new file mode 100644
index 000000000..d76e59d7f
--- /dev/null
+++ b/qa/tasks/rgw_logsocket.py
@@ -0,0 +1,165 @@
+"""
+rgw s3tests logging wrappers
+"""
+from io import BytesIO
+from configobj import ConfigObj
+import contextlib
+import logging
+from tasks import s3tests
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Run s3tests download function
+    """
+    return s3tests.download(ctx, config)
+
+def _config_user(s3tests_conf, section, user):
+    """
+    Run s3tests user config function
+    """
+    return s3tests._config_user(s3tests_conf, section, user)
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+    """
+    Run s3tests user create function
+    """
+    return s3tests.create_users(ctx, config)
+
+@contextlib.contextmanager
+def configure(ctx, config):
+    """
+    Run s3tests user configure function
+    """
+    return s3tests.configure(ctx, config)
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+    """
+    Run remote netcat tests
+    """
+    assert isinstance(config, dict)
+    testdir = teuthology.get_testdir(ctx)
+    for client, client_config in config.items():
+        client_config['extra_args'] = [
+            's3tests.functional.test_s3:test_bucket_list_return_data',
+        ]
+#        args = [
+#                'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
+#                '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir),
+#                '-w',
+#                '{tdir}/s3-tests'.format(tdir=testdir),
+#                '-v',
+#		's3tests.functional.test_s3:test_bucket_list_return_data',
+#                ]
+#        if client_config is not None and 'extra_args' in client_config:
+#            args.extend(client_config['extra_args'])
+#
+#        ctx.cluster.only(client).run(
+#            args=args,
+#            )
+
+    s3tests.run_tests(ctx, config)
+
+    netcat_out = BytesIO()
+
+    for client, client_config in config.items():
+        ctx.cluster.only(client).run(
+            args = [
+                'netcat',
+                '-w', '5',
+                '-U', '{tdir}/rgw.opslog.sock'.format(tdir=testdir),
+                ],
+             stdout = netcat_out,
+        )
+
+        out = netcat_out.getvalue()
+
+        assert len(out) > 100
+
+        log.info('Received', out)
+
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run some s3-tests suite against rgw, verify opslog socket returns data
+
+    Must restrict testing to a particular client::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests: [client.0]
+
+    To pass extra arguments to nose (e.g. to run a certain test)::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests:
+            client.0:
+              extra_args: ['test_s3:test_object_acl_grand_public_read']
+            client.1:
+              extra_args: ['--exclude', 'test_100_continue']
+    """
+    assert hasattr(ctx, 'rgw'), 'rgw-logsocket must run after the rgw task'
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task rgw-logsocket only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level.
+    for (client, cconf) in config.items():
+        teuthology.deep_merge(cconf, overrides.get('rgw-logsocket', {}))
+
+    log.debug('config is %s', config)
+
+    s3tests_conf = {}
+    for client in clients:
+        endpoint = ctx.rgw.role_endpoints.get(client)
+        assert endpoint, 'rgw-logsocket: no rgw endpoint for {}'.format(client)
+
+        s3tests_conf[client] = ConfigObj(
+            indent_type='',
+            infile={
+                'DEFAULT':
+                    {
+                    'port'      : endpoint.port,
+                    'is_secure' : endpoint.cert is not None,
+                    },
+                'fixtures' : {},
+                's3 main'  : {},
+                's3 alt'   : {},
+                }
+            )
+
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: create_users(ctx=ctx, config=dict(
+                clients=clients,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: configure(ctx=ctx, config=dict(
+                clients=config,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: run_tests(ctx=ctx, config=config),
+        ):
+        yield
diff --git a/qa/tasks/rgw_multi b/qa/tasks/rgw_multi
new file mode 120000
index 000000000..abfc703b9
--- /dev/null
+++ b/qa/tasks/rgw_multi
@@ -0,0 +1 @@
+../../src/test/rgw/rgw_multi
+\ No newline at end of file
diff --git a/qa/tasks/rgw_multisite.py b/qa/tasks/rgw_multisite.py
new file mode 100644
index 000000000..266d0fb69
--- /dev/null
+++ b/qa/tasks/rgw_multisite.py
@@ -0,0 +1,436 @@
+"""
+rgw multisite configuration routines
+"""
+import argparse
+import logging
+import random
+import string
+from copy import deepcopy
+from tasks.util.rgw import rgwadmin, wait_for_radosgw
+from tasks.util.rados import create_ec_pool, create_replicated_pool
+from tasks.rgw_multi import multisite
+from tasks.rgw_multi.zone_rados import RadosZone as RadosZone
+from tasks.rgw_multi.zone_ps import PSZone as PSZone
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology.task import Task
+
+log = logging.getLogger(__name__)
+
+class RGWMultisite(Task):
+    """
+    Performs rgw multisite configuration to match the given realm definition.
+
+        - rgw-multisite:
+            realm:
+              name: test-realm
+              is_default: true
+
+    List one or more zonegroup definitions. These are provided as json
+    input to `radosgw-admin zonegroup set`, with the exception of these keys:
+
+    * 'is_master' is passed on the command line as --master
+    * 'is_default' is passed on the command line as --default
+    * 'is_pubsub' is used to create a zone with tier-type=pubsub
+    * 'endpoints' given as client names are replaced with actual endpoints
+
+            zonegroups:
+              - name: test-zonegroup
+                api_name: test-api
+                is_master: true
+                is_default: true
+                endpoints: [c1.client.0]
+
+    List each of the zones to be created in this zonegroup.
+
+                zones:
+                  - name: test-zone1
+                    is_master: true
+                    is_default: true
+                    endpoints: [c1.client.0]
+                  - name: test-zone2
+                    is_default: true
+                    endpoints: [c2.client.0]
+
+    A complete example:
+
+        tasks:
+        - install:
+        - ceph: {cluster: c1}
+        - ceph: {cluster: c2}
+        - rgw:
+            c1.client.0:
+            c2.client.0:
+        - rgw-multisite:
+            realm:
+              name: test-realm
+              is_default: true
+            zonegroups:
+              - name: test-zonegroup
+                is_master: true
+                is_default: true
+                zones:
+                  - name: test-zone1
+                    is_master: true
+                    is_default: true
+                    endpoints: [c1.client.0]
+                  - name: test-zone2
+                    is_default: true
+                    endpoints: [c2.client.0]
+                  - name: test-zone3
+                    is_pubsub: true
+                    endpoints: [c1.client.1]
+
+    """
+    def __init__(self, ctx, config):
+        super(RGWMultisite, self).__init__(ctx, config)
+
+    def setup(self):
+        super(RGWMultisite, self).setup()
+
+        overrides = self.ctx.config.get('overrides', {})
+        misc.deep_merge(self.config, overrides.get('rgw-multisite', {}))
+
+        if not self.ctx.rgw:
+            raise ConfigError('rgw-multisite must run after the rgw task')
+        role_endpoints = self.ctx.rgw.role_endpoints
+
+        # construct Clusters and Gateways for each client in the rgw task
+        clusters, gateways = extract_clusters_and_gateways(self.ctx,
+                                                           role_endpoints)
+
+        # get the master zone and zonegroup configuration
+        mz, mzg = extract_master_zone_zonegroup(self.config['zonegroups'])
+        cluster1 = cluster_for_zone(clusters, mz)
+
+        # create the realm and period on the master zone's cluster
+        log.info('creating realm..')
+        realm = create_realm(cluster1, self.config['realm'])
+        period = realm.current_period
+
+        creds = gen_credentials()
+
+        # create the master zonegroup and its master zone
+        log.info('creating master zonegroup..')
+        master_zonegroup = create_zonegroup(cluster1, gateways, period,
+                                            deepcopy(mzg))
+        period.master_zonegroup = master_zonegroup
+
+        log.info('creating master zone..')
+        master_zone = create_zone(self.ctx, cluster1, gateways, creds,
+                                  master_zonegroup, deepcopy(mz))
+        master_zonegroup.master_zone = master_zone
+
+        period.update(master_zone, commit=True)
+        restart_zone_gateways(master_zone) # restart with --rgw-zone
+
+        # create the admin user on the master zone
+        log.info('creating admin user..')
+        user_args = ['--display-name', 'Realm Admin', '--system']
+        user_args += creds.credential_args()
+        admin_user = multisite.User('realm-admin')
+        admin_user.create(master_zone, user_args)
+
+        # process 'zonegroups'
+        for zg_config in self.config['zonegroups']:
+            zones_config = zg_config.pop('zones')
+
+            zonegroup = None
+            for zone_config in zones_config:
+                # get the cluster for this zone
+                cluster = cluster_for_zone(clusters, zone_config)
+
+                if cluster != cluster1: # already created on master cluster
+                    log.info('pulling realm configuration to %s', cluster.name)
+                    realm.pull(cluster, master_zone.gateways[0], creds)
+
+                # use the first zone's cluster to create the zonegroup
+                if not zonegroup:
+                    if zg_config['name'] == master_zonegroup.name:
+                        zonegroup = master_zonegroup
+                    else:
+                        log.info('creating zonegroup..')
+                        zonegroup = create_zonegroup(cluster, gateways,
+                                                     period, zg_config)
+
+                if zone_config['name'] == master_zone.name:
+                    # master zone was already created
+                    zone = master_zone
+                else:
+                    # create the zone and commit the period
+                    log.info('creating zone..')
+                    zone = create_zone(self.ctx, cluster, gateways, creds,
+                                       zonegroup, zone_config)
+                    period.update(zone, commit=True)
+
+                    restart_zone_gateways(zone) # restart with --rgw-zone
+
+        # attach configuration to the ctx for other tasks
+        self.ctx.rgw_multisite = argparse.Namespace()
+        self.ctx.rgw_multisite.clusters = clusters
+        self.ctx.rgw_multisite.gateways = gateways
+        self.ctx.rgw_multisite.realm = realm
+        self.ctx.rgw_multisite.admin_user = admin_user
+
+        log.info('rgw multisite configuration completed')
+
+    def end(self):
+        del self.ctx.rgw_multisite
+
+class Cluster(multisite.Cluster):
+    """ Issues 'radosgw-admin' commands with the rgwadmin() helper """
+    def __init__(self, ctx, name, client):
+        super(Cluster, self).__init__()
+        self.ctx = ctx
+        self.name = name
+        self.client = client
+
+    def admin(self, args = None, **kwargs):
+        """ radosgw-admin command """
+        args = args or []
+        args += ['--cluster', self.name]
+        args += ['--debug-rgw', str(kwargs.pop('debug_rgw', 0))]
+        args += ['--debug-ms', str(kwargs.pop('debug_ms', 0))]
+        if kwargs.pop('read_only', False):
+            args += ['--rgw-cache-enabled', 'false']
+        kwargs['decode'] = False
+        check_retcode = kwargs.pop('check_retcode', True)
+        r, s = rgwadmin(self.ctx, self.client, args, **kwargs)
+        if check_retcode:
+            assert r == 0
+        return s, r
+
+class Gateway(multisite.Gateway):
+    """ Controls a radosgw instance using its daemon """
+    def __init__(self, role, remote, daemon, *args, **kwargs):
+        super(Gateway, self).__init__(*args, **kwargs)
+        self.role = role
+        self.remote = remote
+        self.daemon = daemon
+
+    def set_zone(self, zone):
+        """ set the zone and add its args to the daemon's command line """
+        assert self.zone is None, 'zone can only be set once'
+        self.zone = zone
+        # daemon.restart_with_args() would be perfect for this, except that
+        # radosgw args likely include a pipe and redirect. zone arguments at
+        # the end won't actually apply to radosgw
+        args = self.daemon.command_kwargs.get('args', [])
+        try:
+            # insert zone args before the first |
+            pipe = args.index(run.Raw('|'))
+            args = args[0:pipe] + zone.zone_args() + args[pipe:]
+        except ValueError:
+            args += zone.zone_args()
+        self.daemon.command_kwargs['args'] = args
+
+    def start(self, args = None):
+        """ (re)start the daemon """
+        self.daemon.restart()
+        # wait until startup completes
+        wait_for_radosgw(self.endpoint(), self.remote)
+
+    def stop(self):
+        """ stop the daemon """
+        self.daemon.stop()
+
+def extract_clusters_and_gateways(ctx, role_endpoints):
+    """ create cluster and gateway instances for all of the radosgw roles """
+    clusters = {}
+    gateways = {}
+    for role, endpoint in role_endpoints.items():
+        cluster_name, daemon_type, client_id = misc.split_role(role)
+        # find or create the cluster by name
+        cluster = clusters.get(cluster_name)
+        if not cluster:
+            clusters[cluster_name] = cluster = Cluster(ctx, cluster_name, role)
+        # create a gateway for this daemon
+        client_with_id = daemon_type + '.' + client_id # match format from rgw.py
+        daemon = ctx.daemons.get_daemon('rgw', client_with_id, cluster_name)
+        if not daemon:
+            raise ConfigError('no daemon for role=%s cluster=%s type=rgw id=%s' % \
+                              (role, cluster_name, client_id))
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        gateways[role] = Gateway(role, remote, daemon, endpoint.hostname,
+                endpoint.port, cluster)
+    return clusters, gateways
+
+def create_realm(cluster, config):
+    """ create a realm from configuration and initialize its first period """
+    realm = multisite.Realm(config['name'])
+    args = []
+    if config.get('is_default', False):
+        args += ['--default']
+    realm.create(cluster, args)
+    realm.current_period = multisite.Period(realm)
+    return realm
+
+def extract_user_credentials(config):
+    """ extract keys from configuration """
+    return multisite.Credentials(config['access_key'], config['secret_key'])
+
+def extract_master_zone(zonegroup_config):
+    """ find and return the master zone definition """
+    master = None
+    for zone in zonegroup_config['zones']:
+        if not zone.get('is_master', False):
+            continue
+        if master:
+            raise ConfigError('zones %s and %s cannot both set \'is_master\'' % \
+                              (master['name'], zone['name']))
+        master = zone
+        # continue the loop so we can detect duplicates
+    if not master:
+        raise ConfigError('one zone must set \'is_master\' in zonegroup %s' % \
+                          zonegroup_config['name'])
+    return master
+
+def extract_master_zone_zonegroup(zonegroups_config):
+    """ find and return the master zone and zonegroup definitions """
+    master_zone, master_zonegroup = (None, None)
+    for zonegroup in zonegroups_config:
+        # verify that all zonegroups have a master zone set, even if they
+        # aren't in the master zonegroup
+        zone = extract_master_zone(zonegroup)
+        if not zonegroup.get('is_master', False):
+            continue
+        if master_zonegroup:
+            raise ConfigError('zonegroups %s and %s cannot both set \'is_master\'' % \
+                              (master_zonegroup['name'], zonegroup['name']))
+        master_zonegroup = zonegroup
+        master_zone = zone
+        # continue the loop so we can detect duplicates
+    if not master_zonegroup:
+        raise ConfigError('one zonegroup must set \'is_master\'')
+    return master_zone, master_zonegroup
+
+def extract_zone_cluster_name(zone_config):
+    """ return the cluster (must be common to all zone endpoints) """
+    cluster_name = None
+    endpoints = zone_config.get('endpoints')
+    if not endpoints:
+        raise ConfigError('zone %s missing \'endpoints\' list' % \
+                          zone_config['name'])
+    for role in endpoints:
+        name, _, _ = misc.split_role(role)
+        if not cluster_name:
+            cluster_name = name
+        elif cluster_name != name:
+            raise ConfigError('all zone %s endpoints must be in the same cluster' % \
+                              zone_config['name'])
+    return cluster_name
+
+def cluster_for_zone(clusters, zone_config):
+    """ return the cluster entry for the given zone """
+    name = extract_zone_cluster_name(zone_config)
+    try:
+        return clusters[name]
+    except KeyError:
+        raise ConfigError('no cluster %s found' % name)
+
+def gen_access_key():
+    return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(16))
+
+def gen_secret():
+    return ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(32))
+
+def gen_credentials():
+    return multisite.Credentials(gen_access_key(), gen_secret())
+
+def extract_gateway_endpoints(gateways, endpoints_config):
+    """ return a list of gateway endpoints associated with the given roles """
+    endpoints = []
+    for role in endpoints_config:
+        try:
+            # replace role names with their gateway's endpoint
+            endpoints.append(gateways[role].endpoint())
+        except KeyError:
+            raise ConfigError('no radosgw endpoint found for role %s' % role)
+    return endpoints
+
+def is_default_arg(config):
+    return ['--default'] if config.pop('is_default', False) else []
+
+def is_master_arg(config):
+    return ['--master'] if config.pop('is_master', False) else []
+
+def create_zonegroup(cluster, gateways, period, config):
+    """ pass the zonegroup configuration to `zonegroup set` """
+    config.pop('zones', None) # remove 'zones' from input to `zonegroup set`
+    endpoints = config.get('endpoints')
+    if endpoints:
+        # replace client names with their gateway endpoints
+        config['endpoints'] = extract_gateway_endpoints(gateways, endpoints)
+    zonegroup = multisite.ZoneGroup(config['name'], period)
+    # `zonegroup set` needs --default on command line, and 'is_master' in json
+    args = is_default_arg(config)
+    zonegroup.set(cluster, config, args)
+    period.zonegroups.append(zonegroup)
+    return zonegroup
+
+def create_zone(ctx, cluster, gateways, creds, zonegroup, config):
+    """ create a zone with the given configuration """
+    zone = multisite.Zone(config['name'], zonegroup, cluster)
+    if config.pop('is_pubsub', False):
+        zone = PSZone(config['name'], zonegroup, cluster)
+    else:
+        zone = RadosZone(config['name'], zonegroup, cluster)
+
+    # collect Gateways for the zone's endpoints
+    endpoints = config.get('endpoints')
+    if not endpoints:
+        raise ConfigError('no \'endpoints\' for zone %s' % config['name'])
+    zone.gateways = [gateways[role] for role in endpoints]
+    for gateway in zone.gateways:
+        gateway.set_zone(zone)
+
+    # format the gateway endpoints
+    endpoints = [g.endpoint() for g in zone.gateways]
+
+    args = is_default_arg(config)
+    args += is_master_arg(config)
+    args += creds.credential_args()
+    if len(endpoints):
+        args += ['--endpoints', ','.join(endpoints)]
+    zone.create(cluster, args)
+    zonegroup.zones.append(zone)
+
+    create_zone_pools(ctx, zone)
+    if ctx.rgw.compression_type:
+        configure_zone_compression(zone, ctx.rgw.compression_type)
+
+    zonegroup.zones_by_type.setdefault(zone.tier_type(), []).append(zone)
+
+    if zone.is_read_only():
+        zonegroup.ro_zones.append(zone)
+    else:
+        zonegroup.rw_zones.append(zone)
+
+    return zone
+
+def create_zone_pools(ctx, zone):
+    """ Create the data_pool for each placement type """
+    gateway = zone.gateways[0]
+    cluster = zone.cluster
+    for pool_config in zone.data.get('placement_pools', []):
+        pool_name = pool_config['val']['storage_classes']['STANDARD']['data_pool']
+        if ctx.rgw.ec_data_pool:
+            create_ec_pool(gateway.remote, pool_name, zone.name, 64,
+                           ctx.rgw.erasure_code_profile, cluster.name, 'rgw')
+        else:
+            create_replicated_pool(gateway.remote, pool_name, 64, cluster.name, 'rgw')
+
+def configure_zone_compression(zone, compression):
+    """ Set compression type in the zone's default-placement """
+    zone.json_command(zone.cluster, 'placement', ['modify',
+                          '--placement-id', 'default-placement',
+                          '--compression', compression
+                      ])
+
+def restart_zone_gateways(zone):
+    zone.stop()
+    zone.start()
+
+task = RGWMultisite
diff --git a/qa/tasks/rgw_multisite_tests.py b/qa/tasks/rgw_multisite_tests.py
new file mode 100644
index 000000000..53aedf792
--- /dev/null
+++ b/qa/tasks/rgw_multisite_tests.py
@@ -0,0 +1,99 @@
+"""
+rgw multisite testing
+"""
+import logging
+import nose.core
+import nose.config
+
+from teuthology.exceptions import ConfigError
+from teuthology.task import Task
+from teuthology import misc
+
+from tasks.rgw_multi import multisite, tests, tests_ps
+
+log = logging.getLogger(__name__)
+
+
+class RGWMultisiteTests(Task):
+    """
+    Runs the rgw_multi tests against a multisite configuration created by the
+    rgw-multisite task. Tests are run with nose, using any additional 'args'
+    provided. Overrides for tests.Config can be set in 'config'.
+
+        - rgw-multisite-tests:
+            args:
+            - tasks.rgw_multi.tests:test_object_sync
+            config:
+              reconfigure_delay: 60
+
+    """
+    def __init__(self, ctx, config):
+        super(RGWMultisiteTests, self).__init__(ctx, config)
+
+    def setup(self):
+        super(RGWMultisiteTests, self).setup()
+
+        overrides = self.ctx.config.get('overrides', {})
+        misc.deep_merge(self.config, overrides.get('rgw-multisite-tests', {}))
+
+        if not self.ctx.rgw_multisite:
+            raise ConfigError('rgw-multisite-tests must run after the rgw-multisite task')
+        realm = self.ctx.rgw_multisite.realm
+        master_zone = realm.meta_master_zone()
+
+        # create the test user
+        log.info('creating test user..')
+        user = multisite.User('rgw-multisite-test-user')
+        user.create(master_zone, ['--display-name', 'Multisite Test User',
+                                  '--gen-access-key', '--gen-secret'])
+
+        config = self.config.get('config', {})
+        tests.init_multi(realm, user, tests.Config(**config))
+        tests.realm_meta_checkpoint(realm)
+
+    def begin(self):
+        # extra arguments for nose can be passed as a string or list
+        extra_args = self.config.get('args', [])
+        if not isinstance(extra_args, list):
+            extra_args = [extra_args]
+        argv = [__name__] + extra_args
+
+        log.info("running rgw multisite tests on '%s' with args=%r",
+                 tests.__name__, extra_args)
+
+        # run nose tests in the rgw_multi.tests module
+        conf = nose.config.Config(stream=get_log_stream(), verbosity=2)
+        error_msg = ''
+        result = nose.run(defaultTest=tests.__name__, argv=argv, config=conf)
+        if not result:
+            error_msg += 'rgw multisite, '
+        result = nose.run(defaultTest=tests_ps.__name__, argv=argv, config=conf)
+        if not result:
+            error_msg += 'rgw multisite pubsub, '
+        if error_msg:
+            raise RuntimeError(error_msg + 'test failures')
+
+
+def get_log_stream():
+    """ return a log stream for nose output """
+    # XXX: this is a workaround for IOErrors when nose writes to stderr,
+    # copied from vstart_runner.py
+    class LogStream(object):
+        def __init__(self):
+            self.buffer = ""
+
+        def write(self, data):
+            self.buffer += data
+            if "\n" in self.buffer:
+                lines = self.buffer.split("\n")
+                for line in lines[:-1]:
+                    log.info(line)
+                self.buffer = lines[-1]
+
+        def flush(self):
+            pass
+
+    return LogStream()
+
+
+task = RGWMultisiteTests
diff --git a/qa/tasks/rook-ceph.conf b/qa/tasks/rook-ceph.conf
new file mode 100644
index 000000000..38ac11e41
--- /dev/null
+++ b/qa/tasks/rook-ceph.conf
@@ -0,0 +1,41 @@
+[global]
+
+log to file = true
+
+mon clock drift allowed = 1.000
+
+# replicate across OSDs, not hosts
+osd crush chooseleaf type = 0
+
+# enable some debugging
+auth debug = true
+ms die on old message = true
+ms die on bug = true
+debug asserts on shutdown = true
+
+
+[osd]
+# debugging
+osd debug shutdown = true
+osd debug op order = true
+osd debug verify stray on activate = true
+osd debug pg log writeout = true
+osd debug verify cached snaps = true
+osd debug verify missing on start = true
+osd debug misdirected ops = true
+osd op queue = debug_random
+osd op queue cut off = debug_random
+osd shutdown pgref assert = true
+bdev debug aio = true
+osd sloppy crc = true
+
+
+[mon]
+# rotate auth tickets quickly to exercise renewal paths
+auth mon ticket ttl = 660      # 11m
+auth service ticket ttl = 240  # 4m
+
+# don't complain about global id reclaim
+mon_warn_on_insecure_global_id_reclaim = false
+mon_warn_on_insecure_global_id_reclaim_allowed = false
+
diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py
new file mode 100644
index 000000000..bdd9e58dc
--- /dev/null
+++ b/qa/tasks/rook.py
@@ -0,0 +1,638 @@
+"""
+Rook cluster task
+"""
+import argparse
+import configobj
+import contextlib
+import json
+import logging
+import os
+import yaml
+from io import BytesIO
+
+from tarfile import ReadError
+from tasks.ceph_manager import CephManager
+from teuthology import misc as teuthology
+from teuthology.config import config as teuth_config
+from teuthology.contextutil import safe_while
+from teuthology.orchestra import run
+from teuthology import contextutil
+from tasks.ceph import healthy
+from tasks.cephadm import update_archive_setting
+
+log = logging.getLogger(__name__)
+
+
+def _kubectl(ctx, config, args, **kwargs):
+    cluster_name = config.get('cluster', 'ceph')
+    return ctx.rook[cluster_name].remote.run(
+        args=['kubectl'] + args,
+        **kwargs
+    )
+
+
+def shell(ctx, config):
+    """
+    Run command(s) inside the rook tools container.
+
+      tasks:
+      - kubeadm:
+      - rook:
+      - rook.shell:
+          - ceph -s
+
+    or
+
+      tasks:
+      - kubeadm:
+      - rook:
+      - rook.shell:
+          commands:
+          - ceph -s
+
+    """
+    if isinstance(config, list):
+        config = {'commands': config}
+    for cmd in config.get('commands', []):
+        if isinstance(cmd, str):
+            _shell(ctx, config, cmd.split(' '))
+        else:
+            _shell(ctx, config, cmd)
+
+
+def _shell(ctx, config, args, **kwargs):
+    cluster_name = config.get('cluster', 'ceph')
+    return _kubectl(
+        ctx, config,
+        [
+            '-n', 'rook-ceph',
+            'exec',
+            ctx.rook[cluster_name].toolbox, '--'
+        ] + args,
+        **kwargs
+    )
+
+
+@contextlib.contextmanager
+def rook_operator(ctx, config):
+    cluster_name = config['cluster']
+    rook_branch = config.get('rook_branch', 'master')
+    rook_git_url = config.get('rook_git_url', 'https://github.com/rook/rook')
+
+    log.info(f'Cloning {rook_git_url} branch {rook_branch}')
+    ctx.rook[cluster_name].remote.run(
+        args=[
+            'rm', '-rf', 'rook',
+            run.Raw('&&'),
+            'git',
+            'clone',
+            '--single-branch',
+            '--branch', rook_branch,
+            rook_git_url,
+            'rook',
+        ]
+    )
+
+    # operator.yaml
+    operator_yaml = ctx.rook[cluster_name].remote.read_file(
+        'rook/cluster/examples/kubernetes/ceph/operator.yaml'
+    )
+    rook_image = config.get('rook_image')
+    if rook_image:
+        log.info(f'Patching operator to use image {rook_image}')
+        crs = list(yaml.load_all(operator_yaml, Loader=yaml.FullLoader))
+        assert len(crs) == 2
+        crs[1]['spec']['template']['spec']['containers'][0]['image'] = rook_image
+        operator_yaml = yaml.dump_all(crs)
+    ctx.rook[cluster_name].remote.write_file('operator.yaml', operator_yaml)
+
+    op_job = None
+    try:
+        log.info('Deploying operator')
+        _kubectl(ctx, config, [
+            'create',
+            '-f', 'rook/cluster/examples/kubernetes/ceph/crds.yaml',
+            '-f', 'rook/cluster/examples/kubernetes/ceph/common.yaml',
+            '-f', 'operator.yaml',
+        ])
+
+        # on centos:
+        if teuthology.get_distro(ctx) == 'centos':
+            _kubectl(ctx, config, [
+                '-n', 'rook-ceph',
+                'set', 'env', 'deploy/rook-ceph-operator',
+                'ROOK_HOSTPATH_REQUIRES_PRIVILEGED=true'
+            ])
+
+        # wait for operator
+        op_name = None
+        with safe_while(sleep=10, tries=90, action="wait for operator") as proceed:
+            while not op_name and proceed():
+                p = _kubectl(
+                    ctx, config,
+                    ['-n', 'rook-ceph', 'get', 'pods', '-l', 'app=rook-ceph-operator'],
+                    stdout=BytesIO(),
+                )
+                for line in p.stdout.getvalue().decode('utf-8').strip().splitlines():
+                    name, ready, status, _ = line.split(None, 3)
+                    if status == 'Running':
+                        op_name = name
+                        break
+
+        # log operator output
+        op_job = _kubectl(
+            ctx,
+            config,
+            ['-n', 'rook-ceph', 'logs', '-f', op_name],
+            wait=False,
+            logger=log.getChild('operator'),
+        )
+
+        yield
+
+    except Exception as e:
+        log.exception(e)
+        raise
+
+    finally:
+        log.info('Cleaning up rook operator')
+        _kubectl(ctx, config, [
+            'delete',
+            '-f', 'operator.yaml',
+        ])
+        if False:
+            # don't bother since we'll tear down k8s anyway (and this mysteriously
+            # fails sometimes when deleting some of the CRDs... not sure why!)
+            _kubectl(ctx, config, [
+                'delete',
+                '-f', 'rook/cluster/examples/kubernetes/ceph/common.yaml',
+            ])
+            _kubectl(ctx, config, [
+                'delete',
+                '-f', 'rook/cluster/examples/kubernetes/ceph/crds.yaml',
+            ])
+        ctx.rook[cluster_name].remote.run(args=['rm', '-rf', 'rook', 'operator.yaml'])
+        if op_job:
+            op_job.wait()
+        run.wait(
+            ctx.cluster.run(
+                args=[
+                    'sudo', 'rm', '-rf', '/var/lib/rook'
+                ]
+            )
+        )
+
+
+@contextlib.contextmanager
+def ceph_log(ctx, config):
+    cluster_name = config['cluster']
+
+    log_dir = '/var/lib/rook/rook-ceph/log'
+    update_archive_setting(ctx, 'log', log_dir)
+
+    try:
+        yield
+
+    except Exception:
+        # we need to know this below
+        ctx.summary['success'] = False
+        raise
+
+    finally:
+        log.info('Checking cluster log for badness...')
+        def first_in_ceph_log(pattern, excludes):
+            """
+            Find the first occurrence of the pattern specified in the Ceph log,
+            Returns None if none found.
+
+            :param pattern: Pattern scanned for.
+            :param excludes: Patterns to ignore.
+            :return: First line of text (or None if not found)
+            """
+            args = [
+                'sudo',
+                'egrep', pattern,
+                f'{log_dir}/ceph.log',
+            ]
+            if excludes:
+                for exclude in excludes:
+                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
+            args.extend([
+                run.Raw('|'), 'head', '-n', '1',
+            ])
+            r = ctx.rook[cluster_name].remote.run(
+                stdout=BytesIO(),
+                args=args,
+            )
+            stdout = r.stdout.getvalue().decode()
+            if stdout:
+                return stdout
+            return None
+
+        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
+                             config.get('log-ignorelist')) is not None:
+            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
+            ctx.summary['success'] = False
+            # use the most severe problem as the failure reason
+            if 'failure_reason' not in ctx.summary:
+                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
+                    match = first_in_ceph_log(pattern, config['log-ignorelist'])
+                    if match is not None:
+                        ctx.summary['failure_reason'] = \
+                            '"{match}" in cluster log'.format(
+                                match=match.rstrip('\n'),
+                            )
+                        break
+
+        if ctx.archive is not None and \
+                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+            # and logs
+            log.info('Compressing logs...')
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo',
+                        'find',
+                        log_dir,
+                        '-name',
+                        '*.log',
+                        '-print0',
+                        run.Raw('|'),
+                        'sudo',
+                        'xargs',
+                        '-0',
+                        '--no-run-if-empty',
+                        '--',
+                        'gzip',
+                        '--',
+                    ],
+                    wait=False,
+                ),
+            )
+
+            log.info('Archiving logs...')
+            path = os.path.join(ctx.archive, 'remote')
+            try:
+                os.makedirs(path)
+            except OSError:
+                pass
+            for remote in ctx.cluster.remotes.keys():
+                sub = os.path.join(path, remote.name)
+                try:
+                    os.makedirs(sub)
+                except OSError:
+                    pass
+                try:
+                    teuthology.pull_directory(remote, log_dir,
+                                              os.path.join(sub, 'log'))
+                except ReadError:
+                    pass
+
+
+def build_initial_config(ctx, config):
+    path = os.path.join(os.path.dirname(__file__), 'rook-ceph.conf')
+    conf = configobj.ConfigObj(path, file_error=True)
+
+    # overrides
+    for section, keys in config.get('conf',{}).items():
+        for key, value in keys.items():
+            log.info(" override: [%s] %s = %s" % (section, key, value))
+            if section not in conf:
+                conf[section] = {}
+            conf[section][key] = value
+
+    return conf
+
+
+@contextlib.contextmanager
+def rook_cluster(ctx, config):
+    cluster_name = config['cluster']
+
+    # count how many OSDs we'll create
+    num_devs = 0
+    num_hosts = 0
+    for remote in ctx.cluster.remotes.keys():
+        ls = remote.read_file('/scratch_devs').decode('utf-8').strip().splitlines()
+        num_devs += len(ls)
+        num_hosts += 1
+    ctx.rook[cluster_name].num_osds = num_devs
+
+    # config
+    config = build_initial_config(ctx, config)
+    config_fp = BytesIO()
+    config.write(config_fp)
+    log.info(f'Config:\n{config_fp.getvalue()}')
+    _kubectl(ctx, config, ['create', '-f', '-'], stdin=yaml.dump({
+        'apiVersion': 'v1',
+        'kind': 'ConfigMap',
+        'metadata': {
+            'name': 'rook-config-override',
+            'namespace': 'rook-ceph'},
+        'data': {
+            'config': config_fp.getvalue()
+        }
+    }))
+
+    # cluster
+    cluster = {
+        'apiVersion': 'ceph.rook.io/v1',
+        'kind': 'CephCluster',
+        'metadata': {'name': 'rook-ceph', 'namespace': 'rook-ceph'},
+        'spec': {
+            'cephVersion': {
+                'image': ctx.rook[cluster_name].image,
+                'allowUnsupported': True,
+            },
+            'dataDirHostPath': '/var/lib/rook',
+            'skipUpgradeChecks': True,
+            'mgr': {
+                'count': 1,
+                'modules': [
+                    { 'name': 'rook', 'enabled': True },
+                ],
+            },
+            'mon': {
+                'count': num_hosts,
+                'allowMultiplePerNode': True,
+            },
+            'storage': {
+                'storageClassDeviceSets': [
+                    {
+                        'name': 'scratch',
+                        'count': num_devs,
+                        'portable': False,
+                        'volumeClaimTemplates': [
+                            {
+                                'metadata': {'name': 'data'},
+                                'spec': {
+                                    'resources': {
+                                        'requests': {
+                                            'storage': '10Gi'  # <= (lte) the actual PV size
+                                        }
+                                    },
+                                    'storageClassName': 'scratch',
+                                    'volumeMode': 'Block',
+                                    'accessModes': ['ReadWriteOnce'],
+                                },
+                            },
+                        ],
+                    }
+                ],
+            },
+        }
+    }
+    teuthology.deep_merge(cluster['spec'], config.get('spec', {}))
+    
+    cluster_yaml = yaml.dump(cluster)
+    log.info(f'Cluster:\n{cluster_yaml}')
+    try:
+        ctx.rook[cluster_name].remote.write_file('cluster.yaml', cluster_yaml)
+        _kubectl(ctx, config, ['create', '-f', 'cluster.yaml'])
+        yield
+
+    except Exception as e:
+        log.exception(e)
+        raise
+
+    finally:
+        _kubectl(ctx, config, ['delete', '-f', 'cluster.yaml'], check_status=False)
+
+        # wait for cluster to shut down
+        log.info('Waiting for cluster to stop')
+        running = True
+        with safe_while(sleep=5, tries=100, action="wait for teardown") as proceed:
+            while running and proceed():
+                p = _kubectl(
+                    ctx, config,
+                    ['-n', 'rook-ceph', 'get', 'pods'],
+                    stdout=BytesIO(),
+                )
+                running = False
+                for line in p.stdout.getvalue().decode('utf-8').strip().splitlines():
+                    name, ready, status, _ = line.split(None, 3)
+                    if (
+                            name != 'NAME'
+                            and not name.startswith('csi-')
+                            and not name.startswith('rook-ceph-operator-')
+                            and not name.startswith('rook-ceph-tools-')
+                    ):
+                        running = True
+                        break
+
+        _kubectl(
+            ctx, config,
+            ['-n', 'rook-ceph', 'delete', 'configmap', 'rook-config-override'],
+            check_status=False,
+        )
+        ctx.rook[cluster_name].remote.run(args=['rm', '-f', 'cluster.yaml'])
+
+
+@contextlib.contextmanager
+def rook_toolbox(ctx, config):
+    cluster_name = config['cluster']
+    try:
+        _kubectl(ctx, config, [
+            'create',
+            '-f', 'rook/cluster/examples/kubernetes/ceph/toolbox.yaml',
+        ])
+
+        log.info('Waiting for tools container to start')
+        toolbox = None
+        with safe_while(sleep=5, tries=100, action="wait for toolbox") as proceed:
+            while not toolbox and proceed():
+                p = _kubectl(
+                    ctx, config,
+                    ['-n', 'rook-ceph', 'get', 'pods', '-l', 'app=rook-ceph-tools'],
+                    stdout=BytesIO(),
+                )
+                for line in p.stdout.getvalue().decode('utf-8').strip().splitlines():
+                    name, ready, status, _ = line.split(None, 3)
+                    if status == 'Running':
+                        toolbox = name
+                        break
+        ctx.rook[cluster_name].toolbox = toolbox
+        yield
+
+    except Exception as e:
+        log.exception(e)
+        raise
+
+    finally:
+        _kubectl(ctx, config, [
+            'delete',
+            '-f', 'rook/cluster/examples/kubernetes/ceph/toolbox.yaml',
+        ], check_status=False)
+
+
+@contextlib.contextmanager
+def wait_for_osds(ctx, config):
+    cluster_name = config.get('cluster', 'ceph')
+
+    want = ctx.rook[cluster_name].num_osds
+    log.info(f'Waiting for {want} OSDs')
+    with safe_while(sleep=10, tries=90, action="check osd count") as proceed:
+        while proceed():
+            p = _shell(ctx, config, ['ceph', 'osd', 'stat', '-f', 'json'],
+                       stdout=BytesIO(),
+                       check_status=False)
+            if p.exitstatus == 0:
+                r = json.loads(p.stdout.getvalue().decode('utf-8'))
+                have = r.get('num_up_osds', 0)
+                if have == want:
+                    break
+                log.info(f' have {have}/{want} OSDs')
+
+    yield
+
+
+@contextlib.contextmanager
+def ceph_config_keyring(ctx, config):
+    # get config and push to hosts
+    log.info('Distributing ceph config and client.admin keyring')
+    p = _shell(ctx, config, ['cat', '/etc/ceph/ceph.conf'], stdout=BytesIO())
+    conf = p.stdout.getvalue()
+    p = _shell(ctx, config, ['cat', '/etc/ceph/keyring'], stdout=BytesIO())
+    keyring = p.stdout.getvalue()
+    ctx.cluster.run(args=['sudo', 'mkdir', '-p', '/etc/ceph'])
+    for remote in ctx.cluster.remotes.keys():
+        remote.write_file(
+            '/etc/ceph/ceph.conf',
+            conf,
+            sudo=True,
+        )
+        remote.write_file(
+            '/etc/ceph/keyring',
+            keyring,
+            sudo=True,
+        )
+
+    try:
+        yield
+
+    except Exception as e:
+        log.exception(e)
+        raise
+
+    finally:
+        log.info('Cleaning up config and client.admin keyring')
+        ctx.cluster.run(args=[
+            'sudo', 'rm', '-f',
+            '/etc/ceph/ceph.conf',
+            '/etc/ceph/ceph.client.admin.keyring'
+        ])
+
+
+@contextlib.contextmanager
+def ceph_clients(ctx, config):
+    cluster_name = config['cluster']
+
+    log.info('Setting up client nodes...')
+    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
+    for remote, roles_for_host in clients.remotes.items():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
+                                                     cluster_name):
+            name = teuthology.ceph_role(role)
+            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
+                                                                name)
+            r = _shell(ctx, config,
+                args=[
+                    'ceph', 'auth',
+                    'get-or-create', name,
+                    'mon', 'allow *',
+                    'osd', 'allow *',
+                    'mds', 'allow *',
+                    'mgr', 'allow *',
+                ],
+                stdout=BytesIO(),
+            )
+            keyring = r.stdout.getvalue()
+            remote.write_file(client_keyring, keyring, sudo=True, mode='0644')
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy rook-ceph cluster
+
+      tasks:
+      - kubeadm:
+      - rook:
+          branch: wip-foo
+          spec:
+            mon:
+              count: 1
+
+    The spec item is deep-merged against the cluster.yaml.  The branch, sha1, or
+    image items are used to determine the Ceph container image.
+    """
+    if not config:
+        config = {}
+    assert isinstance(config, dict), \
+        "task only supports a dictionary for configuration"
+
+    log.info('Rook start')
+
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('ceph', {}))
+    teuthology.deep_merge(config, overrides.get('rook', {}))
+    log.info('Config: ' + str(config))
+
+    # set up cluster context
+    if not hasattr(ctx, 'rook'):
+        ctx.rook = {}
+    if 'cluster' not in config:
+        config['cluster'] = 'ceph'
+    cluster_name = config['cluster']
+    if cluster_name not in ctx.rook:
+        ctx.rook[cluster_name] = argparse.Namespace()
+
+    ctx.rook[cluster_name].remote = list(ctx.cluster.remotes.keys())[0]
+
+    # image
+    teuth_defaults = teuth_config.get('defaults', {})
+    cephadm_defaults = teuth_defaults.get('cephadm', {})
+    containers_defaults = cephadm_defaults.get('containers', {})
+    container_image_name = containers_defaults.get('image', None)
+    if 'image' in config:
+        ctx.rook[cluster_name].image = config.get('image')
+    else:
+        sha1 = config.get('sha1')
+        flavor = config.get('flavor', 'default')
+        if sha1:
+            if flavor == "crimson":
+                ctx.rook[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
+            else:
+                ctx.rook[cluster_name].image = container_image_name + ':' + sha1
+        else:
+            # hmm, fall back to branch?
+            branch = config.get('branch', 'master')
+            ctx.rook[cluster_name].image = container_image_name + ':' + branch
+    log.info('Ceph image is %s' % ctx.rook[cluster_name].image)
+    
+    with contextutil.nested(
+            lambda: rook_operator(ctx, config),
+            lambda: ceph_log(ctx, config),
+            lambda: rook_cluster(ctx, config),
+            lambda: rook_toolbox(ctx, config),
+            lambda: wait_for_osds(ctx, config),
+            lambda: ceph_config_keyring(ctx, config),
+            lambda: ceph_clients(ctx, config),
+    ):
+        if not hasattr(ctx, 'managers'):
+            ctx.managers = {}
+        ctx.managers[cluster_name] = CephManager(
+            ctx.rook[cluster_name].remote,
+            ctx=ctx,
+            logger=log.getChild('ceph_manager.' + cluster_name),
+            cluster=cluster_name,
+            rook=True,
+        )
+        try:
+            if config.get('wait-for-healthy', True):
+                healthy(ctx=ctx, config=config)
+            log.info('Rook complete, yielding')
+            yield
+
+        finally:
+            log.info('Tearing down rook')
diff --git a/qa/tasks/s3a_hadoop.py b/qa/tasks/s3a_hadoop.py
new file mode 100644
index 000000000..7b77359fc
--- /dev/null
+++ b/qa/tasks/s3a_hadoop.py
@@ -0,0 +1,285 @@
+import contextlib
+import logging
+from teuthology import misc
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+     Run Hadoop S3A tests using Ceph
+     usage:
+      -tasks:
+         ceph-ansible:
+         s3a-hadoop:
+           maven-version: '3.6.3' (default)
+           hadoop-version: '2.9.2'
+           bucket-name: 's3atest' (default)
+           access-key: 'anykey' (uses a default value)
+           secret-key: 'secretkey' ( uses a default value)
+           role: client.0
+    """
+    if config is None:
+        config = {}
+
+    assert isinstance(config, dict), \
+        "task only supports a dictionary for configuration"
+
+    assert hasattr(ctx, 'rgw'), 's3a-hadoop must run after the rgw task'
+
+    overrides = ctx.config.get('overrides', {})
+    misc.deep_merge(config, overrides.get('s3a-hadoop', {}))
+    testdir = misc.get_testdir(ctx)
+
+    role = config.get('role')
+    (remote,) = ctx.cluster.only(role).remotes.keys()
+    endpoint = ctx.rgw.role_endpoints.get(role)
+    assert endpoint, 's3tests: no rgw endpoint for {}'.format(role)
+
+    # get versions
+    maven_major = config.get('maven-major', 'maven-3')
+    maven_version = config.get('maven-version', '3.6.3')
+    hadoop_ver = config.get('hadoop-version', '2.9.2')
+    bucket_name = config.get('bucket-name', 's3atest')
+    access_key = config.get('access-key', 'EGAQRD2ULOIFKFSKCT4F')
+    secret_key = config.get(
+        'secret-key',
+        'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb')
+
+    # set versions for cloning the repo
+    apache_maven = 'apache-maven-{maven_version}-bin.tar.gz'.format(
+        maven_version=maven_version)
+    maven_link = 'http://archive.apache.org/dist/maven/' + \
+        '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + apache_maven
+    hadoop_git = 'https://github.com/apache/hadoop'
+    hadoop_rel = 'hadoop-{ver} rel/release-{ver}'.format(ver=hadoop_ver)
+    if hadoop_ver == 'trunk':
+        # just checkout a new branch out of trunk
+        hadoop_rel = 'hadoop-ceph-trunk'
+    install_prereq(remote)
+    remote.run(
+        args=[
+            'cd',
+            testdir,
+            run.Raw('&&'),
+            'wget',
+            maven_link,
+            run.Raw('&&'),
+            'tar',
+            '-xvf',
+            apache_maven,
+            run.Raw('&&'),
+            'git',
+            'clone',
+            run.Raw(hadoop_git),
+            run.Raw('&&'),
+            'cd',
+            'hadoop',
+            run.Raw('&&'),
+            'git',
+            'checkout',
+            '-b',
+            run.Raw(hadoop_rel)
+        ]
+    )
+    configure_s3a(remote, endpoint.dns_name, access_key, secret_key, bucket_name, testdir)
+    setup_user_bucket(remote, endpoint.dns_name, access_key, secret_key, bucket_name, testdir)
+    if hadoop_ver.startswith('2.8'):
+        # test all ITtests but skip AWS test using public bucket landsat-pds
+        # which is not available from within this test
+        test_options = '-Dit.test=ITestS3A* -Dparallel-tests -Dscale \
+                        -Dfs.s3a.scale.test.timeout=1200 \
+                        -Dfs.s3a.scale.test.huge.filesize=256M verify'
+    else:
+        test_options = 'test -Dtest=S3a*,TestS3A*'
+    try:
+        run_s3atest(remote, maven_version, testdir, test_options)
+        yield
+    finally:
+        log.info("Done s3a testing, Cleaning up")
+        for fil in ['apache*', 'hadoop*', 'venv*', 'create*']:
+            remote.run(args=['rm', run.Raw('-rf'), run.Raw('{tdir}/{file}'.format(tdir=testdir, file=fil))])
+
+
+def install_prereq(client):
+    """
+    Install pre requisites for RHEL and CentOS
+    TBD: Ubuntu
+    """
+    if client.os.name == 'rhel' or client.os.name == 'centos':
+        client.run(
+               args=[
+                    'sudo',
+                    'yum',
+                    'install',
+                    '-y',
+                    'protobuf-c.x86_64',
+                    'java',
+                    'java-1.8.0-openjdk-devel',
+                    'dnsmasq'
+                    ]
+                )
+
+
+def setup_user_bucket(client, dns_name, access_key, secret_key, bucket_name, testdir):
+    """
+    Create user with access_key and secret_key that will be
+    used for the s3a testdir
+    """
+    client.run(
+        args=[
+            'sudo',
+            'radosgw-admin',
+            'user',
+            'create',
+            run.Raw('--uid'),
+            's3a',
+            run.Raw('--display-name="s3a cephtests"'),
+            run.Raw('--access-key={access_key}'.format(access_key=access_key)),
+            run.Raw('--secret-key={secret_key}'.format(secret_key=secret_key)),
+            run.Raw('--email=s3a@ceph.com'),
+        ]
+    )
+    client.run(
+        args=[
+            'python3',
+            '-m',
+            'venv',
+            '{testdir}/venv'.format(testdir=testdir),
+            run.Raw('&&'),
+            run.Raw('{testdir}/venv/bin/pip'.format(testdir=testdir)),
+            'install',
+            'boto'
+        ]
+    )
+    create_bucket = """
+#!/usr/bin/env python
+import boto
+import boto.s3.connection
+access_key = '{access_key}'
+secret_key = '{secret_key}'
+
+conn = boto.connect_s3(
+        aws_access_key_id = access_key,
+        aws_secret_access_key = secret_key,
+        host = '{dns_name}',
+        is_secure=False,
+        calling_format = boto.s3.connection.OrdinaryCallingFormat(),
+        )
+bucket = conn.create_bucket('{bucket_name}')
+for bucket in conn.get_all_buckets():
+        print(bucket.name + "\t" + bucket.creation_date)
+""".format(access_key=access_key, secret_key=secret_key, dns_name=dns_name, bucket_name=bucket_name)
+    py_bucket_file = '{testdir}/create_bucket.py'.format(testdir=testdir)
+    client.sudo_write_file(py_bucket_file, create_bucket, mode='0744')
+    client.run(
+        args=[
+            'cat',
+            '{testdir}/create_bucket.py'.format(testdir=testdir),
+        ]
+    )
+    client.run(
+        args=[
+            '{testdir}/venv/bin/python'.format(testdir=testdir),
+            '{testdir}/create_bucket.py'.format(testdir=testdir),
+        ]
+    )
+
+
+def run_s3atest(client, maven_version, testdir, test_options):
+    """
+    Finally run the s3a test
+    """
+    aws_testdir = '{testdir}/hadoop/hadoop-tools/hadoop-aws/'.format(testdir=testdir)
+    run_test = '{testdir}/apache-maven-{maven_version}/bin/mvn'.format(testdir=testdir, maven_version=maven_version)
+    # Remove AWS CredentialsProvider tests as it hits public bucket from AWS
+    # better solution is to create the public bucket on local server and test
+    rm_test = 'rm src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java'
+    client.run(
+        args=[
+            'cd',
+            run.Raw(aws_testdir),
+            run.Raw('&&'),
+            run.Raw(rm_test),
+            run.Raw('&&'),
+            run.Raw(run_test),
+            run.Raw(test_options)
+        ]
+    )
+
+
+def configure_s3a(client, dns_name, access_key, secret_key, bucket_name, testdir):
+    """
+    Use the template to configure s3a test, Fill in access_key, secret_key
+    and other details required for test.
+    """
+    config_template = """<configuration>
+<property>
+<name>fs.s3a.endpoint</name>
+<value>{name}</value>
+</property>
+
+<property>
+<name>fs.contract.test.fs.s3a</name>
+<value>s3a://{bucket_name}/</value>
+</property>
+
+<property>
+<name>fs.s3a.connection.ssl.enabled</name>
+<value>false</value>
+</property>
+
+<property>
+<name>test.fs.s3n.name</name>
+<value>s3n://{bucket_name}/</value>
+</property>
+
+<property>
+<name>test.fs.s3a.name</name>
+<value>s3a://{bucket_name}/</value>
+</property>
+
+<property>
+<name>test.fs.s3.name</name>
+<value>s3://{bucket_name}/</value>
+</property>
+
+<property>
+<name>fs.s3.awsAccessKeyId</name>
+<value>{access_key}</value>
+</property>
+
+<property>
+<name>fs.s3.awsSecretAccessKey</name>
+<value>{secret_key}</value>
+</property>
+
+<property>
+<name>fs.s3n.awsAccessKeyId</name>
+<value>{access_key}</value>
+</property>
+
+<property>
+<name>fs.s3n.awsSecretAccessKey</name>
+<value>{secret_key}</value>
+</property>
+
+<property>
+<name>fs.s3a.access.key</name>
+<description>AWS access key ID. Omit for Role-based authentication.</description>
+<value>{access_key}</value>
+</property>
+
+<property>
+<name>fs.s3a.secret.key</name>
+<description>AWS secret key. Omit for Role-based authentication.</description>
+<value>{secret_key}</value>
+</property>
+</configuration>
+""".format(name=dns_name, bucket_name=bucket_name, access_key=access_key, secret_key=secret_key)
+    config_path = testdir + '/hadoop/hadoop-tools/hadoop-aws/src/test/resources/auth-keys.xml'
+    client.write_file(config_path, config_template)
+    # output for debug
+    client.run(args=['cat', config_path])
diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py
new file mode 100644
index 000000000..8b40499e4
--- /dev/null
+++ b/qa/tasks/s3tests.py
@@ -0,0 +1,653 @@
+"""
+Run a set of s3 tests on rgw.
+"""
+from io import BytesIO
+from configobj import ConfigObj
+import base64
+import contextlib
+import logging
+import os
+import random
+import string
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.exceptions import ConfigError
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download the s3 tests from the git builder.
+    Remove downloaded s3 file upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading s3-tests...')
+    testdir = teuthology.get_testdir(ctx)
+    for (client, client_config) in config.items():
+        s3tests_branch = client_config.get('force-branch', None)
+        if not s3tests_branch:
+            raise ValueError(
+                "Could not determine what branch to use for s3-tests. Please add 'force-branch: {s3-tests branch name}' to the .yaml config for this s3tests task.")
+
+        log.info("Using branch '%s' for s3tests", s3tests_branch)
+        sha1 = client_config.get('sha1')
+        git_remote = client_config.get('git_remote', teuth_config.ceph_git_base_url)
+        ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone',
+                '-b', s3tests_branch,
+                git_remote + 's3-tests.git',
+                '{tdir}/s3-tests'.format(tdir=testdir),
+                ],
+            )
+        if sha1 is not None:
+            ctx.cluster.only(client).run(
+                args=[
+                    'cd', '{tdir}/s3-tests'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'git', 'reset', '--hard', sha1,
+                    ],
+                )
+    try:
+        yield
+    finally:
+        log.info('Removing s3-tests...')
+        testdir = teuthology.get_testdir(ctx)
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[
+                    'rm',
+                    '-rf',
+                    '{tdir}/s3-tests'.format(tdir=testdir),
+                    ],
+                )
+
+
+def _config_user(s3tests_conf, section, user):
+    """
+    Configure users for this section by stashing away keys, ids, and
+    email addresses.
+    """
+    s3tests_conf[section].setdefault('user_id', user)
+    s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
+    s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
+    s3tests_conf[section].setdefault('access_key',
+        ''.join(random.choice(string.ascii_uppercase) for i in range(20)))
+    s3tests_conf[section].setdefault('secret_key',
+        base64.b64encode(os.urandom(40)).decode())
+    s3tests_conf[section].setdefault('totp_serial',
+        ''.join(random.choice(string.digits) for i in range(10)))
+    s3tests_conf[section].setdefault('totp_seed',
+        base64.b32encode(os.urandom(40)).decode())
+    s3tests_conf[section].setdefault('totp_seconds', '5')
+
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+    """
+    Create a main and an alternate s3 user.
+    """
+    assert isinstance(config, dict)
+    log.info('Creating rgw users...')
+    testdir = teuthology.get_testdir(ctx)
+    
+    if ctx.sts_variable:
+        users = {'s3 main': 'foo', 's3 alt': 'bar', 's3 tenant': 'testx$tenanteduser', 'iam': 'foobar'}
+        for client in config['clients']:
+            s3tests_conf = config['s3tests_conf'][client]
+            s3tests_conf.setdefault('fixtures', {})
+            s3tests_conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-')
+            for section, user in users.items():
+                _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
+                log.debug('Creating user {user} on {host}'.format(user=s3tests_conf[section]['user_id'], host=client))
+                cluster_name, daemon_type, client_id = teuthology.split_role(client)
+                client_with_id = daemon_type + '.' + client_id
+                if section=='iam':
+                    ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client_with_id,
+                            'user', 'create',
+                            '--uid', s3tests_conf[section]['user_id'],
+                            '--display-name', s3tests_conf[section]['display_name'],
+                            '--access-key', s3tests_conf[section]['access_key'],
+                            '--secret', s3tests_conf[section]['secret_key'],
+                            '--cluster', cluster_name,
+                        ],
+                    )
+                    ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client_with_id,
+                            'caps', 'add',
+                            '--uid', s3tests_conf[section]['user_id'],
+                            '--caps', 'user-policy=*',
+                            '--cluster', cluster_name,
+                        ],
+                    )
+                    ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client_with_id,
+                            'caps', 'add',
+                            '--uid', s3tests_conf[section]['user_id'],
+                            '--caps', 'roles=*',
+                            '--cluster', cluster_name,
+                        ],
+                    )
+                    ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client_with_id,
+                            'caps', 'add',
+                            '--uid', s3tests_conf[section]['user_id'],
+                            '--caps', 'oidc-provider=*',
+                            '--cluster', cluster_name,
+                        ],
+                    )
+
+                else: 
+                    ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client_with_id,
+                            'user', 'create',
+                            '--uid', s3tests_conf[section]['user_id'],
+                            '--display-name', s3tests_conf[section]['display_name'],
+                            '--access-key', s3tests_conf[section]['access_key'],
+                            '--secret', s3tests_conf[section]['secret_key'],
+                            '--email', s3tests_conf[section]['email'],
+                            '--caps', 'user-policy=*',
+                            '--cluster', cluster_name,
+                        ],
+                    )
+                    ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client_with_id,
+                            'mfa', 'create',
+                            '--uid', s3tests_conf[section]['user_id'],
+                            '--totp-serial', s3tests_conf[section]['totp_serial'],
+                            '--totp-seed', s3tests_conf[section]['totp_seed'],
+                            '--totp-seconds', s3tests_conf[section]['totp_seconds'],
+                            '--totp-window', '8',
+                            '--totp-seed-type', 'base32',
+                            '--cluster', cluster_name,
+                        ],
+                    )
+
+    else:
+        users = {'s3 main': 'foo', 's3 alt': 'bar', 's3 tenant': 'testx$tenanteduser'}
+        for client in config['clients']:
+            s3tests_conf = config['s3tests_conf'][client]
+            s3tests_conf.setdefault('fixtures', {})
+            s3tests_conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-')
+            for section, user in users.items():
+                _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
+                log.debug('Creating user {user} on {host}'.format(user=s3tests_conf[section]['user_id'], host=client))
+                cluster_name, daemon_type, client_id = teuthology.split_role(client)
+                client_with_id = daemon_type + '.' + client_id
+                ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client_with_id,
+                            'user', 'create',
+                            '--uid', s3tests_conf[section]['user_id'],
+                            '--display-name', s3tests_conf[section]['display_name'],
+                            '--access-key', s3tests_conf[section]['access_key'],
+                            '--secret', s3tests_conf[section]['secret_key'],
+                            '--email', s3tests_conf[section]['email'],
+                            '--caps', 'user-policy=*',
+                            '--cluster', cluster_name,
+                        ],
+                    )
+                ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client_with_id,
+                            'mfa', 'create',
+                            '--uid', s3tests_conf[section]['user_id'],
+                            '--totp-serial', s3tests_conf[section]['totp_serial'],
+                            '--totp-seed', s3tests_conf[section]['totp_seed'],
+                            '--totp-seconds', s3tests_conf[section]['totp_seconds'],
+                            '--totp-window', '8',
+                            '--totp-seed-type', 'base32',
+                            '--cluster', cluster_name,
+                        ],
+                    )
+
+    if "TOKEN" in os.environ:
+        s3tests_conf.setdefault('webidentity', {})
+        s3tests_conf['webidentity'].setdefault('token',os.environ['TOKEN'])
+        s3tests_conf['webidentity'].setdefault('aud',os.environ['AUD'])
+        s3tests_conf['webidentity'].setdefault('sub',os.environ['SUB'])
+        s3tests_conf['webidentity'].setdefault('azp',os.environ['AZP'])
+        s3tests_conf['webidentity'].setdefault('user_token',os.environ['USER_TOKEN'])
+        s3tests_conf['webidentity'].setdefault('thumbprint',os.environ['THUMBPRINT'])
+        s3tests_conf['webidentity'].setdefault('KC_REALM',os.environ['KC_REALM'])
+
+    try:
+        yield
+    finally:
+        for client in config['clients']:
+            for user in users.values():
+                uid = '{user}.{client}'.format(user=user, client=client)
+                cluster_name, daemon_type, client_id = teuthology.split_role(client)
+                client_with_id = daemon_type + '.' + client_id
+                ctx.cluster.only(client).run(
+                    args=[
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        '{tdir}/archive/coverage'.format(tdir=testdir),
+                        'radosgw-admin',
+                        '-n', client_with_id,
+                        'user', 'rm',
+                        '--uid', uid,
+                        '--purge-data',
+                        '--cluster', cluster_name,
+                        ],
+                    )
+
+
+@contextlib.contextmanager
+def configure(ctx, config):
+    """
+    Configure the s3-tests.  This includes the running of the
+    bootstrap code and the updating of local conf files.
+    """
+    assert isinstance(config, dict)
+    log.info('Configuring s3-tests...')
+    testdir = teuthology.get_testdir(ctx)
+    for client, properties in config['clients'].items():
+        properties = properties or {}
+        s3tests_conf = config['s3tests_conf'][client]
+        s3tests_conf['DEFAULT']['calling_format'] = properties.get('calling-format', 'ordinary')
+
+        # use rgw_server if given, or default to local client
+        role = properties.get('rgw_server', client)
+
+        endpoint = ctx.rgw.role_endpoints.get(role)
+        assert endpoint, 's3tests: no rgw endpoint for {}'.format(role)
+
+        s3tests_conf['DEFAULT']['host'] = endpoint.dns_name
+
+        website_role = properties.get('rgw_website_server')
+        if website_role:
+            website_endpoint = ctx.rgw.role_endpoints.get(website_role)
+            assert website_endpoint, \
+                    's3tests: no rgw endpoint for rgw_website_server {}'.format(website_role)
+            assert website_endpoint.website_dns_name, \
+                    's3tests: no dns-s3website-name for rgw_website_server {}'.format(website_role)
+            s3tests_conf['DEFAULT']['s3website_domain'] = website_endpoint.website_dns_name
+
+        if hasattr(ctx, 'barbican'):
+            properties = properties['barbican']
+            if properties is not None and 'kms_key' in properties:
+                if not (properties['kms_key'] in ctx.barbican.keys):
+                    raise ConfigError('Key '+properties['kms_key']+' not defined')
+
+                if not (properties['kms_key2'] in ctx.barbican.keys):
+                    raise ConfigError('Key '+properties['kms_key2']+' not defined')
+
+                key = ctx.barbican.keys[properties['kms_key']]
+                s3tests_conf['DEFAULT']['kms_keyid'] = key['id']
+
+                key = ctx.barbican.keys[properties['kms_key2']]
+                s3tests_conf['DEFAULT']['kms_keyid2'] = key['id']
+
+        elif hasattr(ctx, 'vault'):
+            engine_or_flavor = vars(ctx.vault).get('flavor',ctx.vault.engine)
+            keys=[]
+            for name in (x['Path'] for x in vars(ctx.vault).get('keys', {}).get(ctx.rgw.vault_role)):
+                keys.append(name)
+
+            keys.extend(['testkey-1','testkey-2'])
+            if engine_or_flavor == "old":
+                keys=[keys[i] + "/1" for i in range(len(keys))]
+
+            properties = properties.get('vault_%s' % engine_or_flavor, {})
+            s3tests_conf['DEFAULT']['kms_keyid'] = properties.get('key_path', keys[0])
+            s3tests_conf['DEFAULT']['kms_keyid2'] = properties.get('key_path2', keys[1])
+        elif hasattr(ctx.rgw, 'pykmip_role'):
+            keys=[]
+            for name in (x['Name'] for x in ctx.pykmip.keys[ctx.rgw.pykmip_role]):
+                p=name.partition('-')
+                keys.append(p[2] if p[2] else p[0])
+            keys.extend(['testkey-1', 'testkey-2'])
+            s3tests_conf['DEFAULT']['kms_keyid'] = properties.get('kms_key', keys[0])
+            s3tests_conf['DEFAULT']['kms_keyid2'] = properties.get('kms_key2', keys[1])
+        else:
+            # Fallback scenario where it's the local (ceph.conf) kms being tested
+            s3tests_conf['DEFAULT']['kms_keyid'] = 'testkey-1'
+            s3tests_conf['DEFAULT']['kms_keyid2'] = 'testkey-2'
+
+        slow_backend = properties.get('slow_backend')
+        if slow_backend:
+            s3tests_conf['fixtures']['slow backend'] = slow_backend
+
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        remote.run(
+            args=[
+                'cd',
+                '{tdir}/s3-tests'.format(tdir=testdir),
+                run.Raw('&&'),
+                './bootstrap',
+                ],
+            )
+        conf_fp = BytesIO()
+        s3tests_conf.write(conf_fp)
+        remote.write_file(
+            path='{tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
+            data=conf_fp.getvalue(),
+            )
+
+    log.info('Configuring boto...')
+    boto_src = os.path.join(os.path.dirname(__file__), 'boto.cfg.template')
+    for client, properties in config['clients'].items():
+        with open(boto_src) as f:
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            conf = f.read().format(
+                idle_timeout=config.get('idle_timeout', 30)
+                )
+            remote.write_file('{tdir}/boto.cfg'.format(tdir=testdir), conf)
+
+    try:
+        yield
+
+    finally:
+        log.info('Cleaning up boto...')
+        for client, properties in config['clients'].items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(
+                args=[
+                    'rm',
+                    '{tdir}/boto.cfg'.format(tdir=testdir),
+                    ],
+                )
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+    """
+    Run the s3tests after everything is set up.
+
+    :param ctx: Context passed to task
+    :param config: specific configuration information
+    """
+    assert isinstance(config, dict)
+    testdir = teuthology.get_testdir(ctx)
+    for client, client_config in config.items():
+        client_config = client_config or {}
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        args = [
+            'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
+            'BOTO_CONFIG={tdir}/boto.cfg'.format(tdir=testdir)
+            ]
+        # the 'requests' library comes with its own ca bundle to verify ssl
+        # certificates - override that to use the system's ca bundle, which
+        # is where the ssl task installed this certificate
+        if remote.os.package_type == 'deb':
+            args += ['REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt']
+        else:
+            args += ['REQUESTS_CA_BUNDLE=/etc/pki/tls/certs/ca-bundle.crt']
+        # civetweb > 1.8 && beast parsers are strict on rfc2616
+        attrs = ["!fails_on_rgw", "!lifecycle_expiration", "!fails_strict_rfc2616","!s3select","!test_of_sts","!webidentity_test"]
+        if client_config.get('calling-format') != 'ordinary':
+            attrs += ['!fails_with_subdomain']
+       
+        if 'extra_attrs' in client_config:
+            attrs = client_config.get('extra_attrs') 
+        args += [
+            '{tdir}/s3-tests/virtualenv/bin/python'.format(tdir=testdir),
+            '-m', 'nose',
+            '-w',
+            '{tdir}/s3-tests'.format(tdir=testdir),
+            '-v',
+            '-a', ','.join(attrs),
+            ]
+        if 'extra_args' in client_config:
+            args.append(client_config['extra_args'])
+
+        remote.run(
+            args=args,
+            label="s3 tests against rgw"
+            )
+    yield
+
+@contextlib.contextmanager
+def scan_for_leaked_encryption_keys(ctx, config):
+    """
+    Scan radosgw logs for the encryption keys used by s3tests to
+    verify that we're not leaking secrets.
+
+    :param ctx: Context passed to task
+    :param config: specific configuration information
+    """
+    assert isinstance(config, dict)
+
+    try:
+        yield
+    finally:
+        # x-amz-server-side-encryption-customer-key
+        s3test_customer_key = 'pO3upElrwuEXSoFwCfnZPdSsmt/xWeFa0N9KgDijwVs='
+
+        log.debug('Scanning radosgw logs for leaked encryption keys...')
+        procs = list()
+        for client, client_config in config.items():
+            if not client_config.get('scan_for_encryption_keys', True):
+                continue
+            cluster_name, daemon_type, client_id = teuthology.split_role(client)
+            client_with_cluster = '.'.join((cluster_name, daemon_type, client_id))
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            proc = remote.run(
+                args=[
+                    'grep',
+                    '--binary-files=text',
+                    s3test_customer_key,
+                    '/var/log/ceph/rgw.{client}.log'.format(client=client_with_cluster),
+                ],
+                wait=False,
+                check_status=False,
+            )
+            procs.append(proc)
+
+        for proc in procs:
+            proc.wait()
+            if proc.returncode == 1: # 1 means no matches
+                continue
+            log.error('radosgw log is leaking encryption keys!')
+            raise Exception('radosgw log is leaking encryption keys')
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the s3-tests suite against rgw.
+
+    To run all tests on all clients::
+
+        tasks:
+        - ceph:
+        - rgw:
+        - s3tests:
+
+    To restrict testing to particular clients::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests: [client.0]
+
+    To run against a server on client.1 and increase the boto timeout to 10m::
+
+        tasks:
+        - ceph:
+        - rgw: [client.1]
+        - s3tests:
+            client.0:
+              rgw_server: client.1
+              idle_timeout: 600
+
+    To pass extra arguments to nose (e.g. to run a certain test)::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests:
+            client.0:
+              extra_args: ['test_s3:test_object_acl_grand_public_read']
+            client.1:
+              extra_args: ['--exclude', 'test_100_continue']
+
+    To run any sts-tests don't forget to set a config variable named 'sts_tests' to 'True' as follows::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests:
+            client.0:
+              sts_tests: True
+              rgw_server: client.0
+
+    """
+    assert hasattr(ctx, 'rgw'), 's3tests must run after the rgw task'
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task s3tests only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level.
+    for client in config.keys():
+        if not config[client]:
+            config[client] = {}
+        teuthology.deep_merge(config[client], overrides.get('s3tests', {}))
+
+    log.debug('s3tests config is %s', config)
+
+    s3tests_conf = {}
+
+    for client, client_config in config.items():
+        if 'sts_tests' in client_config:
+            ctx.sts_variable = True
+        else:
+            ctx.sts_variable = False
+        #This will be the structure of config file when you want to run webidentity_test (sts-test)
+        if ctx.sts_variable and "TOKEN" in os.environ:
+            for client in clients:
+                endpoint = ctx.rgw.role_endpoints.get(client)
+                assert endpoint, 's3tests: no rgw endpoint for {}'.format(client)
+
+                s3tests_conf[client] = ConfigObj(
+                    indent_type='',
+                    infile={
+                        'DEFAULT':
+                            {
+                            'port'      : endpoint.port,
+                            'is_secure' : endpoint.cert is not None,
+                            'api_name'  : 'default',
+                            },
+                        'fixtures'   : {},
+                        's3 main'    : {},
+                        's3 alt'     : {},
+                        's3 tenant'  : {},
+                        'iam'        : {},
+                        'webidentity': {},
+                    }
+                )
+
+        elif ctx.sts_variable:
+            #This will be the structure of config file when you want to run assume_role_test and get_session_token_test (sts-test)
+            for client in clients:
+                endpoint = ctx.rgw.role_endpoints.get(client)
+                assert endpoint, 's3tests: no rgw endpoint for {}'.format(client)
+
+                s3tests_conf[client] = ConfigObj(
+                    indent_type='',
+                    infile={
+                        'DEFAULT':
+                            {
+                            'port'      : endpoint.port,
+                            'is_secure' : endpoint.cert is not None,
+                            'api_name'  : 'default',
+                            },
+                        'fixtures'   : {},
+                        's3 main'    : {},
+                        's3 alt'     : {},
+                        's3 tenant'  : {},
+                        'iam'        : {},
+                        }
+                    ) 
+
+        else:
+            #This will be the structure of config file when you want to run normal s3-tests
+            for client in clients:
+                endpoint = ctx.rgw.role_endpoints.get(client)
+                assert endpoint, 's3tests: no rgw endpoint for {}'.format(client)
+
+                s3tests_conf[client] = ConfigObj(
+                    indent_type='',
+                    infile={
+                        'DEFAULT':
+                            {
+                            'port'      : endpoint.port,
+                            'is_secure' : endpoint.cert is not None,
+                            'api_name'  : 'default',
+                            },
+                        'fixtures'   : {},
+                        's3 main'    : {},
+                        's3 alt'     : {},
+                        's3 tenant'  : {},
+                        }
+                    )
+
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: create_users(ctx=ctx, config=dict(
+                clients=clients,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: configure(ctx=ctx, config=dict(
+                clients=config,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: run_tests(ctx=ctx, config=config),
+        lambda: scan_for_leaked_encryption_keys(ctx=ctx, config=config),
+        ):
+        pass
+    yield
diff --git a/qa/tasks/s3tests_java.py b/qa/tasks/s3tests_java.py
new file mode 100644
index 000000000..dbe03921c
--- /dev/null
+++ b/qa/tasks/s3tests_java.py
@@ -0,0 +1,402 @@
+"""
+Task for running RGW S3 tests with the AWS Java SDK
+"""
+from io import BytesIO
+import logging
+
+import base64
+import os
+import random
+import string
+import yaml
+import getpass
+
+from teuthology import misc as teuthology
+from teuthology.task import Task
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+"""
+    Task for running RGW S3 tests with the AWS Java SDK
+    
+    Tests run only on clients specified in the s3tests-java config section. 
+    If no client is given a default 'client.0' is chosen.
+    If such does not match the rgw client the task will fail.
+        
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests-java:
+            client.0:
+
+    Extra arguments can be passed by adding options to the corresponding client
+    section under the s3tests-java task (e.g. to run a certain test, 
+    specify a different repository and branch for the test suite, 
+    run in info/debug mode (for the java suite) or forward the gradle output to a log file):
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests-java:
+            client.0:
+                force-branch: wip
+                force-repo: 'https://github.com/adamyanova/java_s3tests.git'
+                log-fwd: '../s3tests-java.log'
+                log-level: info
+                extra-args: ['--tests', 'ObjectTest.testEncryptionKeySSECInvalidMd5']
+
+    To run a specific test, provide its name to the extra-args section e.g.:
+        - s3tests-java:
+            client.0:
+                extra-args: ['--tests', 'ObjectTest.testEncryptionKeySSECInvalidMd5']
+    
+"""
+
+
+class S3tests_java(Task):
+    """
+    Download and install S3 tests in Java
+    This will require openjdk and gradle
+    """
+
+    def __init__(self, ctx, config):
+        super(S3tests_java, self).__init__(ctx, config)
+        self.log = log
+        log.debug('S3 Tests Java: __INIT__ ')
+        assert hasattr(ctx, 'rgw'), 'S3tests_java must run after the rgw task'
+        clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(self.ctx.cluster, 'client')]
+        self.all_clients = []
+        for client in clients:
+            if client in self.config:
+                self.all_clients.extend([client])
+        if self.all_clients is None:
+            self.all_clients = 'client.0'
+        self.users = {'s3main': 'tester',
+                      's3alt': 'johndoe', 'tenanted': 'testx$tenanteduser'}
+
+    def setup(self):
+        super(S3tests_java, self).setup()
+        log.debug('S3 Tests Java: SETUP')
+        for client in self.all_clients:
+            self.download_test_suite(client)
+            self.install_required_packages(client)
+
+    def begin(self):
+        super(S3tests_java, self).begin()
+        log.debug('S3 Tests Java: BEGIN')
+        for (host, roles) in self.ctx.cluster.remotes.items():
+            log.debug(
+                'S3 Tests Java: Cluster config is: {cfg}'.format(cfg=roles))
+            log.debug('S3 Tests Java: Host is: {host}'.format(host=host))
+        self.create_users()
+        self.run_tests()
+
+    def end(self):
+        super(S3tests_java, self).end()
+        log.debug('S3 Tests Java: END')
+        for client in self.all_clients:
+            self.remove_tests(client)
+            self.delete_users(client)
+
+    def download_test_suite(self, client):
+        log.info("S3 Tests Java: Downloading test suite...")
+        testdir = teuthology.get_testdir(self.ctx)
+        branch = 'master'
+        repo = 'https://github.com/ceph/java_s3tests.git'
+        if client in self.config and self.config[client] is not None:
+            if 'force-branch' in self.config[client] and self.config[client]['force-branch'] is not None:
+                branch = self.config[client]['force-branch']
+            if 'force-repo' in self.config[client] and self.config[client]['force-repo'] is not None:
+                repo = self.config[client]['force-repo']
+        self.ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone',
+                '-b', branch,
+                repo,
+                '{tdir}/s3-tests-java'.format(tdir=testdir),
+            ],
+            stdout=BytesIO()
+        )
+        if client in self.config and self.config[client] is not None:
+            if 'sha1' in self.config[client] and self.config[client]['sha1'] is not None:
+                self.ctx.cluster.only(client).run(
+                    args=[
+                        'cd', '{tdir}/s3-tests-java'.format(tdir=testdir),
+                        run.Raw('&&'),
+                        'git', 'reset', '--hard', self.config[client]['sha1'],
+                    ],
+                )
+
+            if 'log-level' in self.config[client]:
+                if self.config[client]['log-level'] == 'info':
+                    self.ctx.cluster.only(client).run(
+                        args=[
+                            'sed', '-i', '\'s/log4j.rootLogger=WARN/log4j.rootLogger=INFO/g\'',
+                            '{tdir}/s3-tests-java/src/main/resources/log4j.properties'.format(
+                                tdir=testdir)
+                        ]
+                    )
+                if self.config[client]['log-level'] == 'debug':
+                    self.ctx.cluster.only(client).run(
+                        args=[
+                            'sed', '-i', '\'s/log4j.rootLogger=WARN/log4j.rootLogger=DEBUG/g\'',
+                            '{tdir}/s3-tests-java/src/main/resources/log4j.properties'.format(
+                                tdir=testdir)
+                        ]
+                    )
+
+    def install_required_packages(self, client):
+        """
+        Run bootstrap script to install openjdk and gradle.
+        Add certificates to java keystore
+        """
+        log.info("S3 Tests Java: Installing required packages...")
+        testdir = teuthology.get_testdir(self.ctx)
+        self.ctx.cluster.only(client).run(
+            args=['{tdir}/s3-tests-java/bootstrap.sh'.format(tdir=testdir)],
+            stdout=BytesIO()
+        )
+
+        endpoint = self.ctx.rgw.role_endpoints[client]
+        if endpoint.cert:
+            path = 'lib/security/cacerts'
+            self.ctx.cluster.only(client).run(
+                args=['sudo',
+                      'keytool',
+                      '-import', '-alias', '{alias}'.format(
+                          alias=endpoint.hostname),
+                      '-keystore',
+                      run.Raw(
+                          '$(readlink -e $(dirname $(readlink -e $(which keytool)))/../{path})'.format(path=path)),
+                      '-file', endpoint.cert.certificate,
+                      '-storepass', 'changeit',
+                      ],
+                stdout=BytesIO()
+            )
+
+    def create_users(self):
+        """
+        Create a main and an alternative s3 user.
+        Configuration is read from a skelethon config file
+        s3tests.teuth.config.yaml in the java-s3tests repository
+        and missing information is added from the task.
+        Existing values are NOT overriden unless they are empty!
+        """
+        log.info("S3 Tests Java: Creating S3 users...")
+        testdir = teuthology.get_testdir(self.ctx)
+        for client in self.all_clients:
+            endpoint = self.ctx.rgw.role_endpoints.get(client)
+            local_user = getpass.getuser()
+            remote_user = teuthology.get_test_user()
+            os.system("scp {remote}@{host}:{tdir}/s3-tests-java/s3tests.teuth.config.yaml /home/{local}/".format(
+                host=endpoint.hostname, tdir=testdir, remote=remote_user, local=local_user))
+            s3tests_conf = teuthology.config_file(
+                '/home/{local}/s3tests.teuth.config.yaml'.format(local=local_user))
+            log.debug("S3 Tests Java: s3tests_conf is {s3cfg}".format(
+                s3cfg=s3tests_conf))
+            for section, user in list(self.users.items()):
+                if section in s3tests_conf:
+                    s3_user_id = '{user}.{client}'.format(
+                        user=user, client=client)
+                    log.debug(
+                        'S3 Tests Java: Creating user {s3_user_id}'.format(s3_user_id=s3_user_id))
+                    self._config_user(s3tests_conf=s3tests_conf,
+                                      section=section, user=s3_user_id, client=client)
+                    cluster_name, daemon_type, client_id = teuthology.split_role(
+                        client)
+                    client_with_id = daemon_type + '.' + client_id
+                    args = [
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        '{tdir}/archive/coverage'.format(tdir=testdir),
+                        'radosgw-admin',
+                        '-n', client_with_id,
+                        'user', 'create',
+                        '--uid', s3tests_conf[section]['user_id'],
+                        '--display-name', s3tests_conf[section]['display_name'],
+                        '--access-key', s3tests_conf[section]['access_key'],
+                        '--secret', s3tests_conf[section]['access_secret'],
+                        '--email', s3tests_conf[section]['email'],
+                        '--cluster', cluster_name,
+                    ]
+                    log.info('{args}'.format(args=args))
+                    self.ctx.cluster.only(client).run(
+                        args=args,
+                        stdout=BytesIO()
+                    )
+                else:
+                    self.users.pop(section)
+            self._write_cfg_file(s3tests_conf, client)
+            os.system(
+                "rm -rf /home/{local}/s3tests.teuth.config.yaml".format(local=local_user))
+
+    def _config_user(self, s3tests_conf, section, user, client):
+        """
+        Generate missing users data for this section by stashing away keys, ids, and
+        email addresses.
+        """
+        access_key = ''.join(random.choice(string.ascii_uppercase)
+                             for i in range(20))
+        access_secret = base64.b64encode(os.urandom(40)).decode('ascii')
+        endpoint = self.ctx.rgw.role_endpoints.get(client)
+
+        self._set_cfg_entry(
+            s3tests_conf[section], 'user_id', '{user}'.format(user=user))
+        self._set_cfg_entry(
+            s3tests_conf[section], 'email', '{user}_test@test.test'.format(user=user))
+        self._set_cfg_entry(
+            s3tests_conf[section], 'display_name', 'Ms. {user}'.format(user=user))
+        self._set_cfg_entry(
+            s3tests_conf[section], 'access_key', '{ak}'.format(ak=access_key))
+        self._set_cfg_entry(
+            s3tests_conf[section], 'access_secret', '{asc}'.format(asc=access_secret))
+        self._set_cfg_entry(
+            s3tests_conf[section], 'region', 'us-east-1')
+        self._set_cfg_entry(
+            s3tests_conf[section], 'endpoint', '{ip}:{port}'.format(
+                ip=endpoint.hostname, port=endpoint.port))
+        self._set_cfg_entry(
+            s3tests_conf[section], 'host', endpoint.hostname)
+        self._set_cfg_entry(
+            s3tests_conf[section], 'port', endpoint.port)
+        self._set_cfg_entry(
+            s3tests_conf[section], 'is_secure', True if endpoint.cert else False)
+
+        log.debug("S3 Tests Java: s3tests_conf[{sect}] is {s3cfg}".format(
+            sect=section, s3cfg=s3tests_conf[section]))
+        log.debug('S3 Tests Java: Setion, User = {sect}, {user}'.format(
+            sect=section, user=user))
+
+    def _write_cfg_file(self, cfg_dict, client):
+        """
+        Write s3 tests java config file on the remote node.
+        """
+        testdir = teuthology.get_testdir(self.ctx)
+        (remote,) = self.ctx.cluster.only(client).remotes.keys()
+        data = yaml.safe_dump(cfg_dict, default_flow_style=False)
+        path = testdir + '/archive/s3-tests-java.' + client + '.conf'
+        remote.write_file(path, data)
+
+    def _set_cfg_entry(self, cfg_dict, key, value):
+        if not (key in cfg_dict):
+            cfg_dict.setdefault(key, value)
+        elif cfg_dict[key] is None:
+            cfg_dict[key] = value
+
+    def run_tests(self):
+        log.info("S3 Tests Java: Running tests...")
+        testdir = teuthology.get_testdir(self.ctx)
+        for client in self.all_clients:
+            self.ctx.cluster.only(client).run(
+                args=['cp',
+                      '{tdir}/archive/s3-tests-java.{client}.conf'.format(
+                          tdir=testdir, client=client),
+                      '{tdir}/s3-tests-java/config.properties'.format(
+                          tdir=testdir)
+                      ],
+                stdout=BytesIO()
+            )
+            args = ['cd',
+                    '{tdir}/s3-tests-java'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    '/opt/gradle/gradle/bin/gradle', 'clean', 'test',
+                    '--rerun-tasks', '--no-build-cache',
+                    ]
+            extra_args = []
+            suppress_groups = False
+            self.log_fwd = False
+            self.log_name = ''
+            if client in self.config and self.config[client] is not None:
+                if 'extra-args' in self.config[client]:
+                    extra_args.extend(self.config[client]['extra-args'])
+                    suppress_groups = True
+                if 'log-level' in self.config[client] and self.config[client]['log-level'] == 'debug':
+                    extra_args += ['--debug']
+                if 'log-fwd' in self.config[client]:
+                    self.log_fwd = True
+                    self.log_name = '{tdir}/s3tests_log.txt'.format(
+                        tdir=testdir)
+                    if self.config[client]['log-fwd'] is not None:
+                        self.log_name = self.config[client]['log-fwd']
+                    extra_args += [run.Raw('>>'),
+                                   self.log_name]
+
+            if not suppress_groups:
+                test_groups = ['AWS4Test', 'BucketTest', 'ObjectTest']
+            else:
+                test_groups = ['All']
+
+            for gr in test_groups:
+                for i in range(2):
+                    self.ctx.cluster.only(client).run(
+                        args=['radosgw-admin', 'gc',
+                              'process', '--include-all'],
+                        stdout=BytesIO()
+                    )
+
+                if gr != 'All':
+                    self.ctx.cluster.only(client).run(
+                        args=args + ['--tests'] + [gr] + extra_args,
+                        stdout=BytesIO()
+                    )
+                else:
+                    self.ctx.cluster.only(client).run(
+                        args=args + extra_args,
+                        stdout=BytesIO()
+                    )
+
+                for i in range(2):
+                    self.ctx.cluster.only(client).run(
+                        args=['radosgw-admin', 'gc',
+                              'process', '--include-all'],
+                        stdout=BytesIO()
+                    )
+
+    def remove_tests(self, client):
+        log.info('S3 Tests Java: Cleaning up s3-tests-java...')
+        testdir = teuthology.get_testdir(self.ctx)
+
+        if self.log_fwd:
+            self.ctx.cluster.only(client).run(
+                args=['cd',
+                      '{tdir}/s3-tests-java'.format(tdir=testdir),
+                      run.Raw('&&'),
+                      'cat', self.log_name,
+                      run.Raw('&&'),
+                      'rm', self.log_name],
+                stdout=BytesIO()
+            )
+
+        self.ctx.cluster.only(client).run(
+            args=[
+                'rm',
+                '-rf',
+                '{tdir}/s3-tests-java'.format(tdir=testdir),
+            ],
+            stdout=BytesIO()
+        )
+
+    def delete_users(self, client):
+        log.info("S3 Tests Java: Deleting S3 users...")
+        testdir = teuthology.get_testdir(self.ctx)
+        for section, user in self.users.items():
+            s3_user_id = '{user}.{client}'.format(user=user, client=client)
+            self.ctx.cluster.only(client).run(
+                args=[
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'radosgw-admin',
+                    '-n', client,
+                    'user', 'rm',
+                    '--uid', s3_user_id,
+                    '--purge-data',
+                    '--cluster', 'ceph',
+                ],
+                stdout=BytesIO()
+            )
+
+
+task = S3tests_java
diff --git a/qa/tasks/samba.py b/qa/tasks/samba.py
new file mode 100644
index 000000000..bcc247697
--- /dev/null
+++ b/qa/tasks/samba.py
@@ -0,0 +1,244 @@
+"""
+Samba
+"""
+import contextlib
+import logging
+import time
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+from teuthology.orchestra.daemon import DaemonGroup
+
+log = logging.getLogger(__name__)
+
+
+def get_sambas(ctx, roles):
+    """
+    Scan for roles that are samba.  Yield the id of the the samba role
+    (samba.0, samba.1...)  and the associated remote site
+
+    :param ctx: Context
+    :param roles: roles for this test (extracted from yaml files)
+    """
+    for role in roles:
+        assert isinstance(role, str)
+        PREFIX = 'samba.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        yield (id_, remote)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Setup samba smbd with ceph vfs module.  This task assumes the samba
+    package has already been installed via the install task.
+
+    The config is optional and defaults to starting samba on all nodes.
+    If a config is given, it is expected to be a list of
+    samba nodes to start smbd servers on.
+
+    Example that starts smbd on all samba nodes::
+
+        tasks:
+        - install:
+        - install:
+            project: samba
+            extra_packages: ['samba']
+        - ceph:
+        - samba:
+        - interactive:
+
+    Example that starts smbd on just one of the samba nodes and cifs on the other::
+
+        tasks:
+        - samba: [samba.0]
+        - cifs: [samba.1]
+
+    An optional backend can be specified, and requires a path which smbd will
+    use as the backend storage location:
+
+        roles:
+            - [osd.0, osd.1, osd.2, mon.0, mon.1, mon.2, mds.a]
+            - [client.0, samba.0]
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - samba:
+            samba.0:
+              cephfuse: "{testdir}/mnt.0"
+
+    This mounts ceph to {testdir}/mnt.0 using fuse, and starts smbd with
+    a UNC of //localhost/cephfuse.  Access through that UNC will be on
+    the ceph fuse mount point.
+
+    If no arguments are specified in the samba
+    role, the default behavior is to enable the ceph UNC //localhost/ceph
+    and use the ceph vfs module as the smbd backend.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info("Setting up smbd with ceph vfs...")
+    assert config is None or isinstance(config, list) or isinstance(config, dict), \
+        "task samba got invalid config"
+
+    if config is None:
+        config = dict(('samba.{id}'.format(id=id_), None)
+                  for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba'))
+    elif isinstance(config, list):
+        config = dict((name, None) for name in config)
+
+    samba_servers = list(get_sambas(ctx=ctx, roles=config.keys()))
+
+    testdir = teuthology.get_testdir(ctx)
+
+    if not hasattr(ctx, 'daemons'):
+        ctx.daemons = DaemonGroup()
+
+    for id_, remote in samba_servers:
+
+        rolestr = "samba.{id_}".format(id_=id_)
+
+        confextras = """vfs objects = ceph
+  ceph:config_file = /etc/ceph/ceph.conf"""
+
+        unc = "ceph"
+        backend = "/"
+
+        if config[rolestr] is not None:
+            # verify that there's just one parameter in role
+            if len(config[rolestr]) != 1:
+                log.error("samba config for role samba.{id_} must have only one parameter".format(id_=id_))
+                raise Exception('invalid config')
+            confextras = ""
+            (unc, backendstr) = config[rolestr].items()[0]
+            backend = backendstr.format(testdir=testdir)
+
+        # on first samba role, set ownership and permissions of ceph root
+        # so that samba tests succeed
+        if config[rolestr] is None and id_ == samba_servers[0][0]:
+            remote.run(
+                    args=[
+                        'mkdir', '-p', '/tmp/cmnt', run.Raw('&&'),
+                        'sudo', 'ceph-fuse', '/tmp/cmnt', run.Raw('&&'),
+                        'sudo', 'chown', 'ubuntu:ubuntu', '/tmp/cmnt/', run.Raw('&&'),
+                        'sudo', 'chmod', '1777', '/tmp/cmnt/', run.Raw('&&'),
+                        'sudo', 'umount', '/tmp/cmnt/', run.Raw('&&'),
+                        'rm', '-rf', '/tmp/cmnt',
+                        ],
+                    )
+        else:
+            remote.run(
+                    args=[
+                        'sudo', 'chown', 'ubuntu:ubuntu', backend, run.Raw('&&'),
+                        'sudo', 'chmod', '1777', backend,
+                        ],
+                    )
+
+        remote.sudo_write_file("/usr/local/samba/etc/smb.conf", """
+[global]
+  workgroup = WORKGROUP
+  netbios name = DOMAIN
+
+[{unc}]
+  path = {backend}
+  {extras}
+  writeable = yes
+  valid users = ubuntu
+""".format(extras=confextras, unc=unc, backend=backend))
+
+        # create ubuntu user
+        remote.run(
+            args=[
+                'sudo', '/usr/local/samba/bin/smbpasswd', '-e', 'ubuntu',
+                run.Raw('||'),
+                'printf', run.Raw('"ubuntu\nubuntu\n"'),
+                run.Raw('|'),
+                'sudo', '/usr/local/samba/bin/smbpasswd', '-s', '-a', 'ubuntu'
+            ])
+
+        smbd_cmd = [
+                'sudo',
+                'daemon-helper',
+                'term',
+                'nostdin',
+                '/usr/local/samba/sbin/smbd',
+                '-F',
+                ]
+        ctx.daemons.add_daemon(remote, 'smbd', id_,
+                               args=smbd_cmd,
+                               logger=log.getChild("smbd.{id_}".format(id_=id_)),
+                               stdin=run.PIPE,
+                               wait=False,
+                               )
+
+        # let smbd initialize, probably a better way...
+        seconds_to_sleep = 100
+        log.info('Sleeping for %s  seconds...' % seconds_to_sleep)
+        time.sleep(seconds_to_sleep)
+        log.info('Sleeping stopped...')
+
+    try:
+        yield
+    finally:
+        log.info('Stopping smbd processes...')
+        exc = None
+        for d in ctx.daemons.iter_daemons_of_role('smbd'):
+            try:
+                d.stop()
+            except (run.CommandFailedError,
+                    run.CommandCrashedError,
+                    run.ConnectionLostError) as e:
+                exc = e
+                log.exception('Saw exception from %s.%s', d.role, d.id_)
+        if exc is not None:
+            raise exc
+
+        for id_, remote in samba_servers:
+            remote.run(
+                args=[
+                    'sudo',
+                    'rm', '-rf',
+                    '/usr/local/samba/etc/smb.conf',
+                    '/usr/local/samba/private/*',
+                    '/usr/local/samba/var/run/',
+                    '/usr/local/samba/var/locks',
+                    '/usr/local/samba/var/lock',
+                    ],
+                )
+            # make sure daemons are gone
+            try:
+                remote.run(
+                    args=[
+                        'while',
+                        'sudo', 'killall', '-9', 'smbd',
+                        run.Raw(';'),
+                        'do', 'sleep', '1',
+                        run.Raw(';'),
+                        'done',
+                        ],
+                    )
+
+                remote.run(
+                    args=[
+                        'sudo',
+                        'lsof',
+                        backend,
+                        ],
+                    check_status=False
+                    )
+                remote.run(
+                    args=[
+                        'sudo',
+                        'fuser',
+                        '-M',
+                        backend,
+                        ],
+                    check_status=False
+                    )
+            except Exception:
+                log.exception("Saw exception")
+                pass
diff --git a/qa/tasks/scrub.py b/qa/tasks/scrub.py
new file mode 100644
index 000000000..ddc1a9164
--- /dev/null
+++ b/qa/tasks/scrub.py
@@ -0,0 +1,117 @@
+"""
+Scrub osds
+"""
+import contextlib
+import gevent
+import logging
+import random
+import time
+
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run scrub periodically. Randomly chooses an OSD to scrub.
+
+    The config should be as follows:
+
+    scrub:
+        frequency: <seconds between scrubs>
+        deep: <bool for deepness>
+
+    example:
+
+    tasks:
+    - ceph:
+    - scrub:
+        frequency: 30
+        deep: 0
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'scrub task only accepts a dict for configuration'
+
+    log.info('Beginning scrub...')
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    while len(manager.get_osd_status()['up']) < num_osds:
+        time.sleep(10)
+
+    scrub_proc = Scrubber(
+        manager,
+        config,
+        )
+    try:
+        yield
+    finally:
+        log.info('joining scrub')
+        scrub_proc.do_join()
+
+class Scrubber:
+    """
+    Scrubbing is actually performed during initialization
+    """
+    def __init__(self, manager, config):
+        """
+        Spawn scrubbing thread upon completion.
+        """
+        self.ceph_manager = manager
+        self.ceph_manager.wait_for_clean()
+
+        osd_status = self.ceph_manager.get_osd_status()
+        self.osds = osd_status['up']
+
+        self.config = config
+        if self.config is None:
+            self.config = dict()
+
+        else:
+            def tmp(x):
+                """Local display"""
+                print(x)
+            self.log = tmp
+
+        self.stopping = False
+
+        log.info("spawning thread")
+
+        self.thread = gevent.spawn(self.do_scrub)
+
+    def do_join(self):
+        """Scrubbing thread finished"""
+        self.stopping = True
+        self.thread.get()
+
+    def do_scrub(self):
+        """Perform the scrub operation"""
+        frequency = self.config.get("frequency", 30)
+        deep = self.config.get("deep", 0)
+
+        log.info("stopping %s" % self.stopping)
+
+        while not self.stopping:
+            osd = str(random.choice(self.osds))
+
+            if deep:
+                cmd = 'deep-scrub'
+            else:
+                cmd = 'scrub'
+
+            log.info('%sbing %s' % (cmd, osd))
+            self.ceph_manager.raw_cluster_cmd('osd', cmd, osd)
+
+            time.sleep(frequency)
diff --git a/qa/tasks/scrub_test.py b/qa/tasks/scrub_test.py
new file mode 100644
index 000000000..3d629e9d7
--- /dev/null
+++ b/qa/tasks/scrub_test.py
@@ -0,0 +1,401 @@
+"""Scrub testing"""
+
+import contextlib
+import json
+import logging
+import os
+import time
+import tempfile
+
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+def wait_for_victim_pg(manager):
+    """Return a PG with some data and its acting set"""
+    # wait for some PG to have data that we can mess with
+    victim = None
+    while victim is None:
+        stats = manager.get_pg_stats()
+        for pg in stats:
+            size = pg['stat_sum']['num_bytes']
+            if size > 0:
+                victim = pg['pgid']
+                acting = pg['acting']
+                return victim, acting
+        time.sleep(3)
+
+
+def find_victim_object(ctx, pg, osd):
+    """Return a file to be fuzzed"""
+    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.keys()
+    data_path = os.path.join(
+        '/var/lib/ceph/osd',
+        'ceph-{id}'.format(id=osd),
+        'fuse',
+        '{pg}_head'.format(pg=pg),
+        'all',
+        )
+
+    # fuzz time
+    ls_out = osd_remote.sh('sudo ls %s' % data_path)
+
+    # find an object file we can mess with (and not the pg info object)
+    osdfilename = next(line for line in ls_out.split('\n')
+                       if not line.endswith('::::head#'))
+    assert osdfilename is not None
+
+    # Get actual object name from osd stored filename
+    objname = osdfilename.split(':')[4]
+    return osd_remote, os.path.join(data_path, osdfilename), objname
+
+
+def corrupt_file(osd_remote, path):
+    # put a single \0 at the beginning of the file
+    osd_remote.run(
+        args=['sudo', 'dd',
+              'if=/dev/zero',
+              'of=%s/data' % path,
+              'bs=1', 'count=1', 'conv=notrunc']
+    )
+
+
+def get_pgnum(pgid):
+    pos = pgid.find('.')
+    assert pos != -1
+    return pgid[pos+1:]
+
+
+def deep_scrub(manager, victim, pool):
+    # scrub, verify inconsistent
+    pgnum = get_pgnum(victim)
+    manager.do_pg_scrub(pool, pgnum, 'deep-scrub')
+
+    stats = manager.get_single_pg_stats(victim)
+    inconsistent = stats['state'].find('+inconsistent') != -1
+    assert inconsistent
+
+
+def repair(manager, victim, pool):
+    # repair, verify no longer inconsistent
+    pgnum = get_pgnum(victim)
+    manager.do_pg_scrub(pool, pgnum, 'repair')
+
+    stats = manager.get_single_pg_stats(victim)
+    inconsistent = stats['state'].find('+inconsistent') != -1
+    assert not inconsistent
+
+
+def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool):
+    corrupt_file(osd_remote, obj_path)
+    deep_scrub(manager, pg, pool)
+    repair(manager, pg, pool)
+
+
+def test_repair_bad_omap(ctx, manager, pg, osd, objname):
+    # Test deep-scrub with various omap modifications
+    # Modify omap on specific osd
+    log.info('fuzzing omap of %s' % objname)
+    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
+    manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
+                                   'badkey', 'badval'])
+    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])
+
+    deep_scrub(manager, pg, 'rbd')
+    # please note, the repair here is errnomous, it rewrites the correct omap
+    # digest and data digest on the replicas with the corresponding digests
+    # from the primary osd which is hosting the victim object, see
+    # find_victim_object().
+    # so we need to either put this test and the end of this task or
+    # undo the mess-up manually before the "repair()" that just ensures
+    # the cleanup is sane, otherwise the succeeding tests will fail. if they
+    # try set "badkey" in hope to get an "inconsistent" pg with a deep-scrub.
+    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'hdr'])
+    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'badkey'])
+    manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
+                                   'key', 'val'])
+    repair(manager, pg, 'rbd')
+
+
+class MessUp:
+    def __init__(self, manager, osd_remote, pool, osd_id,
+                 obj_name, obj_path, omap_key, omap_val):
+        self.manager = manager
+        self.osd = osd_remote
+        self.pool = pool
+        self.osd_id = osd_id
+        self.obj = obj_name
+        self.path = obj_path
+        self.omap_key = omap_key
+        self.omap_val = omap_val
+
+    @contextlib.contextmanager
+    def _test_with_file(self, messup_cmd, *checks):
+        temp = tempfile.mktemp()
+        backup_cmd = ['sudo', 'cp', os.path.join(self.path, 'data'), temp]
+        self.osd.run(args=backup_cmd)
+        self.osd.run(args=messup_cmd.split())
+        yield checks
+        create_cmd = ['sudo', 'mkdir', self.path]
+        self.osd.run(args=create_cmd, check_status=False)
+        restore_cmd = ['sudo', 'cp', temp, os.path.join(self.path, 'data')]
+        self.osd.run(args=restore_cmd)
+
+    def remove(self):
+        cmd = 'sudo rmdir {path}'.format(path=self.path)
+        return self._test_with_file(cmd, 'missing')
+
+    def append(self):
+        cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
+              'conv=notrunc oflag=append'.format(path=self.path)
+        return self._test_with_file(cmd,
+                                    'data_digest_mismatch',
+                                    'size_mismatch')
+
+    def truncate(self):
+        cmd = 'sudo dd if=/dev/null of={path}/data'.format(path=self.path)
+        return self._test_with_file(cmd,
+                                    'data_digest_mismatch',
+                                    'size_mismatch')
+
+    def change_obj(self):
+        cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
+              'conv=notrunc'.format(path=self.path)
+        return self._test_with_file(cmd,
+                                    'data_digest_mismatch')
+
+    @contextlib.contextmanager
+    def rm_omap(self):
+        cmd = ['rmomapkey', self.pool, self.obj, self.omap_key]
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+        yield ('omap_digest_mismatch',)
+        cmd = ['setomapval', self.pool, self.obj,
+               self.omap_key, self.omap_val]
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+
+    @contextlib.contextmanager
+    def add_omap(self):
+        cmd = ['setomapval', self.pool, self.obj, 'badkey', 'badval']
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+        yield ('omap_digest_mismatch',)
+        cmd = ['rmomapkey', self.pool, self.obj, 'badkey']
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+
+    @contextlib.contextmanager
+    def change_omap(self):
+        cmd = ['setomapval', self.pool, self.obj, self.omap_key, 'badval']
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+        yield ('omap_digest_mismatch',)
+        cmd = ['setomapval', self.pool, self.obj, self.omap_key, self.omap_val]
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+
+
+class InconsistentObjChecker:
+    """Check the returned inconsistents/inconsistent info"""
+
+    def __init__(self, osd, acting, obj_name):
+        self.osd = osd
+        self.acting = acting
+        self.obj = obj_name
+        assert self.osd in self.acting
+
+    def basic_checks(self, inc):
+        assert inc['object']['name'] == self.obj
+        assert inc['object']['snap'] == "head"
+        assert len(inc['shards']) == len(self.acting), \
+            "the number of returned shard does not match with the acting set"
+
+    def run(self, check, inc):
+        func = getattr(self, check)
+        func(inc)
+
+    def _check_errors(self, inc, err_name):
+        bad_found = False
+        good_found = False
+        for shard in inc['shards']:
+            log.info('shard = %r' % shard)
+            log.info('err = %s' % err_name)
+            assert 'osd' in shard
+            osd = shard['osd']
+            err = err_name in shard['errors']
+            if osd == self.osd:
+                assert bad_found is False, \
+                    "multiple entries found for the given OSD"
+                assert err is True, \
+                    "Didn't find '{err}' in errors".format(err=err_name)
+                bad_found = True
+            else:
+                assert osd in self.acting, "shard not in acting set"
+                assert err is False, \
+                    "Expected '{err}' in errors".format(err=err_name)
+                good_found = True
+        assert bad_found is True, \
+            "Shard for osd.{osd} not found".format(osd=self.osd)
+        assert good_found is True, \
+            "No other acting shards found"
+
+    def _check_attrs(self, inc, attr_name):
+        bad_attr = None
+        good_attr = None
+        for shard in inc['shards']:
+            log.info('shard = %r' % shard)
+            log.info('attr = %s' % attr_name)
+            assert 'osd' in shard
+            osd = shard['osd']
+            attr = shard.get(attr_name, False)
+            if osd == self.osd:
+                assert bad_attr is None, \
+                    "multiple entries found for the given OSD"
+                bad_attr = attr
+            else:
+                assert osd in self.acting, "shard not in acting set"
+                assert good_attr is None or good_attr == attr, \
+                    "multiple good attrs found"
+                good_attr = attr
+        assert bad_attr is not None, \
+            "bad {attr} not found".format(attr=attr_name)
+        assert good_attr is not None, \
+            "good {attr} not found".format(attr=attr_name)
+        assert good_attr != bad_attr, \
+            "bad attr is identical to the good ones: " \
+            "{0} == {1}".format(good_attr, bad_attr)
+
+    def data_digest_mismatch(self, inc):
+        assert 'data_digest_mismatch' in inc['errors']
+        self._check_attrs(inc, 'data_digest')
+
+    def missing(self, inc):
+        assert 'missing' in inc['union_shard_errors']
+        self._check_errors(inc, 'missing')
+
+    def size_mismatch(self, inc):
+        assert 'size_mismatch' in inc['errors']
+        self._check_attrs(inc, 'size')
+
+    def omap_digest_mismatch(self, inc):
+        assert 'omap_digest_mismatch' in inc['errors']
+        self._check_attrs(inc, 'omap_digest')
+
+
+def test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd_id,
+                               obj_name, obj_path):
+    mon = manager.controller
+    pool = 'rbd'
+    omap_key = 'key'
+    omap_val = 'val'
+    manager.do_rados(['setomapval', obj_name, omap_key, omap_val], pool=pool)
+    # Update missing digests, requires "osd deep scrub update digest min age: 0"
+    pgnum = get_pgnum(pg)
+    manager.do_pg_scrub(pool, pgnum, 'deep-scrub')
+
+    messup = MessUp(manager, osd_remote, pool, osd_id, obj_name, obj_path,
+                    omap_key, omap_val)
+    for test in [messup.rm_omap, messup.add_omap, messup.change_omap,
+                 messup.append, messup.truncate, messup.change_obj,
+                 messup.remove]:
+        with test() as checks:
+            deep_scrub(manager, pg, pool)
+            cmd = 'rados list-inconsistent-pg {pool} ' \
+                  '--format=json'.format(pool=pool)
+            pgs = json.loads(mon.sh(cmd))
+            assert pgs == [pg]
+
+            cmd = 'rados list-inconsistent-obj {pg} ' \
+                  '--format=json'.format(pg=pg)
+            objs = json.loads(mon.sh(cmd))
+            assert len(objs['inconsistents']) == 1
+
+            checker = InconsistentObjChecker(osd_id, acting, obj_name)
+            inc_obj = objs['inconsistents'][0]
+            log.info('inc = %r', inc_obj)
+            checker.basic_checks(inc_obj)
+            for check in checks:
+                checker.run(check, inc_obj)
+
+
+def task(ctx, config):
+    """
+    Test [deep] scrub
+
+    tasks:
+    - chef:
+    - install:
+    - ceph:
+        log-ignorelist:
+        - '!= data_digest'
+        - '!= omap_digest'
+        - '!= size'
+        - deep-scrub 0 missing, 1 inconsistent objects
+        - deep-scrub [0-9]+ errors
+        - repair 0 missing, 1 inconsistent objects
+        - repair [0-9]+ errors, [0-9]+ fixed
+        - shard [0-9]+ .* : missing
+        - deep-scrub 1 missing, 1 inconsistent objects
+        - does not match object info size
+        - attr name mistmatch
+        - deep-scrub 1 missing, 0 inconsistent objects
+        - failed to pick suitable auth object
+        - candidate size [0-9]+ info size [0-9]+ mismatch
+      conf:
+        osd:
+          osd deep scrub update digest min age: 0
+    - scrub_test:
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'scrub_test task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < num_osds:
+        time.sleep(10)
+
+    for i in range(num_osds):
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
+                                '--', '--osd-objectstore-fuse')
+    manager.flush_pg_stats(range(num_osds))
+    manager.wait_for_clean()
+
+    # write some data
+    p = manager.do_rados(['bench', '--no-cleanup', '1', 'write', '-b', '4096'], pool='rbd')
+    log.info('err is %d' % p.exitstatus)
+
+    # wait for some PG to have data that we can mess with
+    pg, acting = wait_for_victim_pg(manager)
+    osd = acting[0]
+
+    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
+    manager.do_rados(['setomapval', obj_name, 'key', 'val'], pool='rbd')
+    log.info('err is %d' % p.exitstatus)
+    manager.do_rados(['setomapheader', obj_name, 'hdr'], pool='rbd')
+    log.info('err is %d' % p.exitstatus)
+
+    # Update missing digests, requires "osd deep scrub update digest min age: 0"
+    pgnum = get_pgnum(pg)
+    manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')
+
+    log.info('messing with PG %s on osd %d' % (pg, osd))
+    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
+    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
+    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
+                               obj_name, obj_path)
+    log.info('test successful!')
+
+    # shut down fuse mount
+    for i in range(num_osds):
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
+                                '--', '--no-osd-objectstore-fuse')
+    time.sleep(5)
+    log.info('done')
diff --git a/qa/tasks/systemd.py b/qa/tasks/systemd.py
new file mode 100644
index 000000000..1728b920f
--- /dev/null
+++ b/qa/tasks/systemd.py
@@ -0,0 +1,135 @@
+"""
+Systemd test
+"""
+import contextlib
+import logging
+import re
+import time
+
+from teuthology.orchestra import run
+from teuthology.misc import reconnect, get_first_mon, wait_until_healthy
+
+log = logging.getLogger(__name__)
+
+def _remote_service_status(remote, service):
+    status = remote.sh('sudo systemctl status %s' % service,
+                       check_status=False)
+    return status
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+      - tasks:
+          ceph-deploy:
+          systemd:
+
+    Test ceph systemd services can start, stop and restart and
+    check for any failed services and report back errors
+    """
+    for remote, roles in ctx.cluster.remotes.items():
+        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+                         'grep', 'ceph'])
+        units = remote.sh('sudo systemctl list-units | grep ceph',
+                          check_status=False)
+        log.info(units)
+        if units.find('failed'):
+            log.info("Ceph services in failed state")
+
+        # test overall service stop and start using ceph.target
+        # ceph.target tests are meant for ceph systemd tests
+        # and not actual process testing using 'ps'
+        log.info("Stopping all Ceph services")
+        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+        status = _remote_service_status(remote, 'ceph.target')
+        log.info(status)
+        log.info("Checking process status")
+        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
+        if ps_eaf.find('Active: inactive'):
+            log.info("Successfully stopped all ceph services")
+        else:
+            log.info("Failed to stop ceph services")
+
+        log.info("Starting all Ceph services")
+        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
+        status = _remote_service_status(remote, 'ceph.target')
+        log.info(status)
+        if status.find('Active: active'):
+            log.info("Successfully started all Ceph services")
+        else:
+            log.info("info", "Failed to start Ceph services")
+        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
+        log.info(ps_eaf)
+        time.sleep(4)
+
+        # test individual services start stop
+        name = remote.shortname
+        mon_name = 'ceph-mon@' + name + '.service'
+        mds_name = 'ceph-mds@' + name + '.service'
+        mgr_name = 'ceph-mgr@' + name + '.service'
+        mon_role_name = 'mon.' + name
+        mds_role_name = 'mds.' + name
+        mgr_role_name = 'mgr.' + name
+        m_osd = re.search('--id (\d+) --setuser ceph', ps_eaf)
+        if m_osd:
+            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
+            remote.run(args=['sudo', 'systemctl', 'status',
+                             osd_service])
+            remote.run(args=['sudo', 'systemctl', 'stop',
+                             osd_service])
+            time.sleep(4)  # immediate check will result in deactivating state
+            status = _remote_service_status(remote, osd_service)
+            log.info(status)
+            if status.find('Active: inactive'):
+                log.info("Successfully stopped single osd ceph service")
+            else:
+                log.info("Failed to stop ceph osd services")
+            remote.sh(['sudo', 'systemctl', 'start', osd_service])
+            time.sleep(4)
+        if mon_role_name in roles:
+            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
+            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
+            time.sleep(4)  # immediate check will result in deactivating state
+            status = _remote_service_status(remote, mon_name)
+            if status.find('Active: inactive'):
+                log.info("Successfully stopped single mon ceph service")
+            else:
+                log.info("Failed to stop ceph mon service")
+            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
+            time.sleep(4)
+        if mgr_role_name in roles:
+            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
+            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
+            time.sleep(4)  # immediate check will result in deactivating state
+            status = _remote_service_status(remote, mgr_name)
+            if status.find('Active: inactive'):
+                log.info("Successfully stopped single ceph mgr service")
+            else:
+                log.info("Failed to stop ceph mgr service")
+            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
+            time.sleep(4)
+        if mds_role_name in roles:
+            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
+            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
+            time.sleep(4)  # immediate check will result in deactivating state
+            status = _remote_service_status(remote, mds_name)
+            if status.find('Active: inactive'):
+                log.info("Successfully stopped single ceph mds service")
+            else:
+                log.info("Failed to stop ceph mds service")
+            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
+            time.sleep(4)
+
+    # reboot all nodes and verify the systemd units restart
+    # workunit that runs would fail if any of the systemd unit doesnt start
+    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
+    # avoid immediate reconnect
+    time.sleep(120)
+    reconnect(ctx, 480)  # reconnect all nodes
+    # for debug info
+    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+                          'grep', 'ceph'])
+    # wait for HEALTH_OK
+    mon = get_first_mon(ctx, config)
+    (mon_remote,) = ctx.cluster.only(mon).remotes.keys()
+    wait_until_healthy(ctx, mon_remote, use_sudo=True)
+    yield
diff --git a/qa/tasks/tempest.py b/qa/tasks/tempest.py
new file mode 100644
index 000000000..cee942e37
--- /dev/null
+++ b/qa/tasks/tempest.py
@@ -0,0 +1,263 @@
+"""
+Deploy and configure Tempest for Teuthology
+"""
+import configparser
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.exceptions import ConfigError
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+def get_tempest_dir(ctx):
+    return '{tdir}/tempest'.format(tdir=teuthology.get_testdir(ctx))
+
+def run_in_tempest_dir(ctx, client, cmdargs, **kwargs):
+    ctx.cluster.only(client).run(
+        args=[ 'cd', get_tempest_dir(ctx), run.Raw('&&'), ] + cmdargs,
+        **kwargs
+    )
+
+def run_in_tempest_rgw_dir(ctx, client, cmdargs, **kwargs):
+    ctx.cluster.only(client).run(
+        args=[ 'cd', get_tempest_dir(ctx) + '/rgw', run.Raw('&&'), ] + cmdargs,
+        **kwargs
+    )
+
+def run_in_tempest_venv(ctx, client, cmdargs, **kwargs):
+    run_in_tempest_dir(ctx, client,
+                        [   'source',
+                            '.tox/venv/bin/activate',
+                            run.Raw('&&')
+                        ] + cmdargs, **kwargs)
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download the Tempest from github.
+    Remove downloaded file upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading Tempest...')
+    for (client, cconf) in config.items():
+        ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone',
+                '-b', cconf.get('force-branch', 'master'),
+                'https://github.com/openstack/tempest.git',
+                get_tempest_dir(ctx)
+            ],
+        )
+
+        sha1 = cconf.get('sha1')
+        if sha1 is not None:
+            run_in_tempest_dir(ctx, client, [ 'git', 'reset', '--hard', sha1 ])
+    try:
+        yield
+    finally:
+        log.info('Removing Tempest...')
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[ 'rm', '-rf', get_tempest_dir(ctx) ],
+            )
+
+def get_toxvenv_dir(ctx):
+    return ctx.tox.venv_path
+
+@contextlib.contextmanager
+def setup_venv(ctx, config):
+    """
+    Setup the virtualenv for Tempest using tox.
+    """
+    assert isinstance(config, dict)
+    log.info('Setting up virtualenv for Tempest')
+    for (client, _) in config.items():
+        run_in_tempest_dir(ctx, client,
+            [   '{tvdir}/bin/tox'.format(tvdir=get_toxvenv_dir(ctx)),
+                '-e', 'venv', '--notest'
+            ])
+    yield
+
+def setup_logging(ctx, cpar):
+    cpar.set('DEFAULT', 'log_dir', teuthology.get_archive_dir(ctx))
+    cpar.set('DEFAULT', 'log_file', 'tempest.log')
+
+def to_config(config, params, section, cpar):
+    for (k, v) in config[section].items():
+        if isinstance(v, str):
+            v = v.format(**params)
+        elif isinstance(v, bool):
+            v = 'true' if v else 'false'
+        else:
+            v = str(v)
+        cpar.set(section, k, v)
+
+@contextlib.contextmanager
+def configure_instance(ctx, config):
+    assert isinstance(config, dict)
+    log.info('Configuring Tempest')
+
+    for (client, cconfig) in config.items():
+        run_in_tempest_venv(ctx, client,
+            [
+                'tempest',
+                'init',
+                '--workspace-path',
+                get_tempest_dir(ctx) + '/workspace.yaml',
+                'rgw'
+            ])
+
+        # prepare the config file
+        tetcdir = '{tdir}/rgw/etc'.format(tdir=get_tempest_dir(ctx))
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        local_conf = remote.get_file(tetcdir + '/tempest.conf.sample')
+
+        # fill the params dictionary which allows to use templatized configs
+        keystone_role = cconfig.get('use-keystone-role', None)
+        if keystone_role is None \
+            or keystone_role not in ctx.keystone.public_endpoints:
+            raise ConfigError('the use-keystone-role is misconfigured')
+        public_host, public_port = ctx.keystone.public_endpoints[keystone_role]
+        params = {
+            'keystone_public_host': public_host,
+            'keystone_public_port': str(public_port),
+        }
+
+        cpar = configparser.ConfigParser()
+        cpar.read(local_conf)
+        setup_logging(ctx, cpar)
+        to_config(cconfig, params, 'auth', cpar)
+        to_config(cconfig, params, 'identity', cpar)
+        to_config(cconfig, params, 'object-storage', cpar)
+        to_config(cconfig, params, 'object-storage-feature-enabled', cpar)
+        cpar.write(open(local_conf, 'w+'))
+
+        remote.put_file(local_conf, tetcdir + '/tempest.conf')
+    yield
+
+@contextlib.contextmanager
+def run_tempest(ctx, config):
+    assert isinstance(config, dict)
+    log.info('Configuring Tempest')
+
+    for (client, cconf) in config.items():
+        blocklist = cconf.get('blocklist', [])
+        assert isinstance(blocklist, list)
+        run_in_tempest_venv(ctx, client,
+            [
+                'tempest',
+                'run',
+                '--workspace-path',
+                get_tempest_dir(ctx) + '/workspace.yaml',
+                '--workspace',
+                'rgw',
+                '--regex', '^tempest.api.object_storage',
+                '--black-regex', '|'.join(blocklist)
+            ])
+    try:
+        yield
+    finally:
+        pass
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy and run Tempest's object storage campaign
+
+    Example of configuration:
+
+      overrides:
+        ceph:
+          conf:
+            client:
+              rgw keystone api version: 3
+              rgw keystone accepted roles: admin,Member
+              rgw keystone implicit tenants: true
+              rgw keystone accepted admin roles: admin
+              rgw swift enforce content length: true
+              rgw swift account in url: true
+              rgw swift versioning enabled: true
+              rgw keystone admin domain: Default
+              rgw keystone admin user: admin
+              rgw keystone admin password: ADMIN
+              rgw keystone admin project: admin
+      tasks:
+      # typically, the task should be preceded with install, ceph, tox,
+      # keystone and rgw. Tox and Keystone are specific requirements
+      # of tempest.py.
+      - rgw:
+          # it's important to match the prefix with the endpoint's URL
+          # in Keystone. Additionally, if we want to test /info and its
+          # accompanying stuff, the whole Swift API must be put in root
+          # of the whole URL  hierarchy (read: frontend_prefix == /swift).
+          frontend_prefix: /swift
+          client.0:
+            use-keystone-role: client.0
+      - tempest:
+          client.0:
+            force-branch: master
+            use-keystone-role: client.0
+            auth:
+              admin_username: admin
+              admin_project_name: admin
+              admin_password: ADMIN
+              admin_domain_name: Default
+            identity:
+              uri: http://{keystone_public_host}:{keystone_public_port}/v2.0/
+              uri_v3: http://{keystone_public_host}:{keystone_public_port}/v3/
+              admin_role: admin
+            object-storage:
+              reseller_admin_role: admin
+            object-storage-feature-enabled:
+              container_sync: false
+              discoverability: false
+            blocklist:
+              # please strip half of these items after merging PRs #15369
+              # and #12704
+              - .*test_list_containers_reverse_order.*
+              - .*test_list_container_contents_with_end_marker.*
+              - .*test_delete_non_empty_container.*
+              - .*test_container_synchronization.*
+              - .*test_get_object_after_expiration_time.*
+              - .*test_create_object_with_transfer_encoding.*
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        'task tempest only supports a list or dictionary for configuration'
+
+    if not ctx.tox:
+        raise ConfigError('tempest must run after the tox task')
+    if not ctx.keystone:
+        raise ConfigError('tempest must run after the keystone task')
+
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level.
+    for client in config.keys():
+        if not config[client]:
+            config[client] = {}
+        teuthology.deep_merge(config[client], overrides.get('keystone', {}))
+
+    log.debug('Tempest config is %s', config)
+
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: setup_venv(ctx=ctx, config=config),
+        lambda: configure_instance(ctx=ctx, config=config),
+        lambda: run_tempest(ctx=ctx, config=config),
+        ):
+        yield
diff --git a/qa/tasks/tests/__init__.py b/qa/tasks/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/tasks/tests/__init__.py
diff --git a/qa/tasks/tests/test_cephadm.py b/qa/tasks/tests/test_cephadm.py
new file mode 100644
index 000000000..403d1915e
--- /dev/null
+++ b/qa/tasks/tests/test_cephadm.py
@@ -0,0 +1,70 @@
+from tasks import cephadm
+
+v1 = """
+[registries.search]
+registries = ['registry.access.redhat.com', 'registry.redhat.io', 'docker.io', 'quay.io']
+
+[registries.insecure]
+registries = []
+"""
+
+v2 = """
+unqualified-search-registries = ["registry.access.redhat.com", "registry.redhat.io", "docker.io", 'quay.io']
+
+[[registry]]
+prefix = "registry.access.redhat.com"
+location = "registry.access.redhat.com"
+insecure = false
+blocked = false
+
+[[registry]]
+prefix = "registry.redhat.io"
+location = "registry.redhat.io"
+insecure = false
+blocked = false
+
+[[registry]]
+prefix = "docker.io"
+location = "docker.io"
+insecure = false
+blocked = false
+
+[[registry.mirror]]
+location = "vossi04.front.sepia.ceph.com:5000"
+insecure = true
+
+[[registry]]
+prefix = "quay.io"
+location = "quay.io"
+insecure = false
+blocked = false
+"""
+
+expected = {
+    'unqualified-search-registries': ['registry.access.redhat.com', 'registry.redhat.io',
+                                      'docker.io', 'quay.io'],
+    'registry': [
+        {'prefix': 'registry.access.redhat.com',
+         'location': 'registry.access.redhat.com',
+         'insecure': False,
+         'blocked': False},
+        {'prefix': 'registry.redhat.io',
+         'location': 'registry.redhat.io',
+         'insecure': False,
+         'blocked': False},
+        {'prefix': 'docker.io',
+         'location': 'docker.io',
+         'insecure': False,
+         'blocked': False,
+         'mirror': [{'location': 'vossi04.front.sepia.ceph.com:5000',
+                     'insecure': True}]},
+        {'prefix': 'quay.io',
+         'location': 'quay.io',
+         'insecure': False,
+         'blocked': False}
+    ]
+}
+
+def test_add_mirror():
+    assert cephadm.registries_add_mirror_to_docker_io(v1, 'vossi04.front.sepia.ceph.com:5000') == expected
+    assert cephadm.registries_add_mirror_to_docker_io(v2, 'vossi04.front.sepia.ceph.com:5000') == expected
diff --git a/qa/tasks/tests/test_devstack.py b/qa/tasks/tests/test_devstack.py
new file mode 100644
index 000000000..39b94a64c
--- /dev/null
+++ b/qa/tasks/tests/test_devstack.py
@@ -0,0 +1,48 @@
+from textwrap import dedent
+
+from tasks import devstack
+
+
+class TestDevstack(object):
+    def test_parse_os_table(self):
+        table_str = dedent("""
+            +---------------------+--------------------------------------+
+            |       Property      |                Value                 |
+            +---------------------+--------------------------------------+
+            |     attachments     |                  []                  |
+            |  availability_zone  |                 nova                 |
+            |       bootable      |                false                 |
+            |      created_at     |      2014-02-21T17:14:47.548361      |
+            | display_description |                 None                 |
+            |     display_name    |                 NAME                 |
+            |          id         | ffdbd1bb-60dc-4d95-acfe-88774c09ad3e |
+            |       metadata      |                  {}                  |
+            |         size        |                  1                   |
+            |     snapshot_id     |                 None                 |
+            |     source_volid    |                 None                 |
+            |        status       |               creating               |
+            |     volume_type     |                 None                 |
+            +---------------------+--------------------------------------+
+            """).strip()
+        expected = {
+            'Property': 'Value',
+            'attachments': '[]',
+            'availability_zone': 'nova',
+            'bootable': 'false',
+            'created_at': '2014-02-21T17:14:47.548361',
+            'display_description': 'None',
+            'display_name': 'NAME',
+            'id': 'ffdbd1bb-60dc-4d95-acfe-88774c09ad3e',
+            'metadata': '{}',
+            'size': '1',
+            'snapshot_id': 'None',
+            'source_volid': 'None',
+            'status': 'creating',
+            'volume_type': 'None'}
+
+        vol_info = devstack.parse_os_table(table_str)
+        assert vol_info == expected
+
+
+
+
diff --git a/qa/tasks/tests/test_radosgw_admin.py b/qa/tasks/tests/test_radosgw_admin.py
new file mode 100644
index 000000000..4e86d5c2b
--- /dev/null
+++ b/qa/tasks/tests/test_radosgw_admin.py
@@ -0,0 +1,31 @@
+from unittest.mock import Mock
+
+from tasks import radosgw_admin
+
+acl_with_version = """<?xml version="1.0" encoding="UTF-8"?><AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy>
+"""  # noqa
+
+
+acl_without_version = """<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy>
+"""  # noqa
+
+
+class TestGetAcl(object):
+
+    def setup(self):
+        self.key = Mock()
+
+    def test_removes_xml_version(self):
+        self.key.get_xml_acl = Mock(return_value=acl_with_version)
+        result = radosgw_admin.get_acl(self.key)
+        assert result.startswith('<AccessControlPolicy')
+
+    def test_xml_version_is_already_removed(self):
+        self.key.get_xml_acl = Mock(return_value=acl_without_version)
+        result = radosgw_admin.get_acl(self.key)
+        assert result.startswith('<AccessControlPolicy')
+
+    def test_newline_gets_trimmed(self):
+        self.key.get_xml_acl = Mock(return_value=acl_without_version)
+        result = radosgw_admin.get_acl(self.key)
+        assert result.endswith('\n') is False
diff --git a/qa/tasks/teuthology_integration.py b/qa/tasks/teuthology_integration.py
new file mode 100644
index 000000000..b5a2278eb
--- /dev/null
+++ b/qa/tasks/teuthology_integration.py
@@ -0,0 +1,19 @@
+import logging
+from teuthology import misc
+from teuthology.task import Task
+
+log = logging.getLogger(__name__)
+
+
+class TeuthologyIntegration(Task):
+
+    def begin(self):
+        misc.sh("""
+        set -x
+        pip install tox
+        tox
+        # tox -e py27-integration
+        tox -e openstack-integration
+        """)
+
+task = TeuthologyIntegration
diff --git a/qa/tasks/tgt.py b/qa/tasks/tgt.py
new file mode 100644
index 000000000..a0758f472
--- /dev/null
+++ b/qa/tasks/tgt.py
@@ -0,0 +1,177 @@
+"""
+Task to handle tgt
+
+Assumptions made:
+    The ceph-extras tgt package may need to get installed.
+    The open-iscsi package needs to get installed.
+"""
+import logging
+import contextlib
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def start_tgt_remotes(ctx, start_tgtd):
+    """
+    This subtask starts up a tgtd on the clients specified
+    """
+    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+    tgtd_list = []
+    for rem, roles in remotes.items():
+        for _id in roles:
+            if _id in start_tgtd:
+                if not rem in tgtd_list:
+                    tgtd_list.append(rem)
+                    size = ctx.config.get('image_size', 10240)
+                    rem.run(
+                        args=[
+                            'rbd',
+                            'create',
+                            'iscsi-image',
+                            '--size',
+                            str(size),
+                    ])
+                    rem.run(
+                        args=[
+                            'sudo',
+                            'tgtadm',
+                            '--lld',
+                            'iscsi',
+                            '--mode',
+                            'target',
+                            '--op',
+                            'new',
+                            '--tid',
+                            '1',
+                            '--targetname',
+                            'rbd',
+                        ])
+                    rem.run(
+                        args=[
+                            'sudo',
+                            'tgtadm',
+                            '--lld',
+                            'iscsi',
+                            '--mode',
+                            'logicalunit',
+                            '--op',
+                            'new',
+                            '--tid',
+                            '1',
+                            '--lun',
+                            '1',
+                            '--backing-store',
+                            'iscsi-image',
+                            '--bstype',
+                            'rbd',
+                        ])
+                    rem.run(
+                        args=[
+                            'sudo',
+                            'tgtadm',
+                            '--lld',
+                            'iscsi',
+                            '--op',
+                            'bind',
+                            '--mode',
+                            'target',
+                            '--tid',
+                            '1',
+                            '-I',
+                            'ALL',
+                        ])
+    try:
+        yield
+
+    finally:
+        for rem in tgtd_list:
+            rem.run(
+                args=[
+                    'sudo',
+                    'tgtadm',
+                    '--lld',
+                    'iscsi',
+                    '--mode',
+                    'target',
+                    '--op',
+                    'delete',
+                    '--force',
+                    '--tid',
+                    '1',
+                ])
+            rem.run(
+                args=[
+                    'rbd',
+                    'snap',
+                    'purge',
+                    'iscsi-image',
+                ])
+            rem.run(
+                args=[
+                    'sudo',
+                    'rbd',
+                    'rm',
+                    'iscsi-image',
+                ])
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Start up tgt.
+
+    To start on on all clients::
+
+        tasks:
+        - ceph:
+        - tgt:
+
+    To start on certain clients::
+
+        tasks:
+        - ceph:
+        - tgt: [client.0, client.3]
+
+    or
+
+        tasks:
+        - ceph:
+        - tgt:
+            client.0:
+            client.3:
+
+    An image blocksize size can also be specified::
+        
+        tasks:
+        - ceph:
+        - tgt:
+            image_size = 20480
+
+    The general flow of things here is:
+        1. Find clients on which tgt is supposed to run (start_tgtd)
+        2. Remotely start up tgt daemon
+    On cleanup:
+        3. Stop tgt daemon
+
+    The iscsi administration is handled by the iscsi task.
+    """
+    if config:
+        config = {key : val for key, val in config.items()
+                if key.startswith('client')}
+    # config at this point should only contain keys starting with 'client'
+    start_tgtd = []
+    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+    log.info(remotes)
+    if not config:
+        start_tgtd = ['client.{id}'.format(id=id_)
+            for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    else:
+        start_tgtd = config
+    log.info(start_tgtd)
+    with contextutil.nested(
+            lambda: start_tgt_remotes(ctx=ctx, start_tgtd=start_tgtd),):
+        yield
diff --git a/qa/tasks/thrash_pool_snaps.py b/qa/tasks/thrash_pool_snaps.py
new file mode 100644
index 000000000..c71c9ce8d
--- /dev/null
+++ b/qa/tasks/thrash_pool_snaps.py
@@ -0,0 +1,61 @@
+"""
+Thrash -- Simulate random osd failures.
+"""
+import contextlib
+import logging
+import gevent
+import time
+import random
+
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    "Thrash" snap creation and removal on the listed pools
+
+    Example:
+
+    thrash_pool_snaps:
+      pools: [.rgw.buckets, .rgw.buckets.index]
+      max_snaps: 10
+      min_snaps: 5
+      period: 10
+    """
+    stopping = False
+    def do_thrash():
+        pools = config.get('pools', [])
+        max_snaps = config.get('max_snaps', 10)
+        min_snaps = config.get('min_snaps', 5)
+        period = config.get('period', 30)
+        snaps = []
+        manager = ctx.managers['ceph']
+        def remove_snap():
+            assert len(snaps) > 0
+            snap = random.choice(snaps)
+            log.info("Removing snap %s" % (snap,))
+            for pool in pools:
+                manager.remove_pool_snap(pool, str(snap))
+            snaps.remove(snap)
+        def add_snap(snap):
+            log.info("Adding snap %s" % (snap,))
+            for pool in pools:
+                manager.add_pool_snap(pool, str(snap))
+            snaps.append(snap)
+        index = 0
+        while not stopping:
+            index += 1
+            time.sleep(period)
+            if len(snaps) <= min_snaps:
+                add_snap(index)
+            elif len(snaps) >= max_snaps:
+                remove_snap()
+            else:
+                random.choice([lambda: add_snap(index), remove_snap])()
+        log.info("Stopping")
+    thread = gevent.spawn(do_thrash)
+    yield
+    stopping = True
+    thread.join()
+
diff --git a/qa/tasks/thrasher.py b/qa/tasks/thrasher.py
new file mode 100644
index 000000000..0ea1bf0ee
--- /dev/null
+++ b/qa/tasks/thrasher.py
@@ -0,0 +1,15 @@
+"""
+Thrasher base class
+"""
+class Thrasher(object):
+
+    def __init__(self):
+        super(Thrasher, self).__init__()
+        self._exception = None
+
+    @property
+    def exception(self):
+        return self._exception
+
+    def set_thrasher_exception(self, e):
+        self._exception = e
diff --git a/qa/tasks/thrashosds-health.yaml b/qa/tasks/thrashosds-health.yaml
new file mode 100644
index 000000000..1b2560d4e
--- /dev/null
+++ b/qa/tasks/thrashosds-health.yaml
@@ -0,0 +1,18 @@
+overrides:
+  ceph:
+    conf:
+      osd:
+        osd max markdown count: 1000
+    log-ignorelist:
+      - overall HEALTH_
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_
+      - \(PG_
+      - \(POOL_
+      - \(CACHE_POOL_
+      - \(SMALLER_PGP_NUM\)
+      - \(OBJECT_
+      - \(SLOW_OPS\)
+      - \(REQUEST_SLOW\)
+      - \(TOO_FEW_PGS\)
+      - slow request
diff --git a/qa/tasks/thrashosds.py b/qa/tasks/thrashosds.py
new file mode 100644
index 000000000..aa7ec437a
--- /dev/null
+++ b/qa/tasks/thrashosds.py
@@ -0,0 +1,221 @@
+"""
+Thrash -- Simulate random osd failures.
+"""
+import contextlib
+import logging
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    "Thrash" the OSDs by randomly marking them out/down (and then back
+    in) until the task is ended. This loops, and every op_delay
+    seconds it randomly chooses to add or remove an OSD (even odds)
+    unless there are fewer than min_out OSDs out of the cluster, or
+    more than min_in OSDs in the cluster.
+
+    All commands are run on mon0 and it stops when __exit__ is called.
+
+    The config is optional, and is a dict containing some or all of:
+
+    cluster: (default 'ceph') the name of the cluster to thrash
+
+    min_in: (default 4) the minimum number of OSDs to keep in the
+       cluster
+
+    min_out: (default 0) the minimum number of OSDs to keep out of the
+       cluster
+
+    op_delay: (5) the length of time to sleep between changing an
+       OSD's status
+
+    min_dead: (0) minimum number of osds to leave down/dead.
+
+    max_dead: (0) maximum number of osds to leave down/dead before waiting
+       for clean.  This should probably be num_replicas - 1.
+
+    clean_interval: (60) the approximate length of time to loop before
+       waiting until the cluster goes clean. (In reality this is used
+       to probabilistically choose when to wait, and the method used
+       makes it closer to -- but not identical to -- the half-life.)
+
+    scrub_interval: (-1) the approximate length of time to loop before
+       waiting until a scrub is performed while cleaning. (In reality
+       this is used to probabilistically choose when to wait, and it
+       only applies to the cases where cleaning is being performed).
+       -1 is used to indicate that no scrubbing will be done.
+
+    chance_down: (0.4) the probability that the thrasher will mark an
+       OSD down rather than marking it out. (The thrasher will not
+       consider that OSD out of the cluster, since presently an OSD
+       wrongly marked down will mark itself back up again.) This value
+       can be either an integer (eg, 75) or a float probability (eg
+       0.75).
+
+    chance_test_min_size: (0) chance to run test_pool_min_size,
+       which:
+       - kills all but one osd
+       - waits
+       - kills that osd
+       - revives all other osds
+       - verifies that the osds fully recover
+
+    timeout: (360) the number of seconds to wait for the cluster
+       to become clean after each cluster change. If this doesn't
+       happen within the timeout, an exception will be raised.
+
+    revive_timeout: (150) number of seconds to wait for an osd asok to
+       appear after attempting to revive the osd
+
+    thrash_primary_affinity: (true) randomly adjust primary-affinity
+
+    chance_pgnum_grow: (0) chance to increase a pool's size
+    chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
+    pool_grow_by: (10) amount to increase pgnum by
+    chance_pgnum_shrink: (0) chance to decrease a pool's size
+    pool_shrink_by: (10) amount to decrease pgnum by
+    max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd
+
+    pause_short: (3) duration of short pause
+    pause_long: (80) duration of long pause
+    pause_check_after: (50) assert osd down after this long
+    chance_inject_pause_short: (1) chance of injecting short stall
+    chance_inject_pause_long: (0) chance of injecting long stall
+
+    clean_wait: (0) duration to wait before resuming thrashing once clean
+
+    sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
+                  random live osd
+
+    powercycle: (false) whether to power cycle the node instead
+        of just the osd process. Note that this assumes that a single
+        osd is the only important process on the node.
+
+    bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
+        the delay lets the BlockDevice "accept" more aio operations but blocks
+        any flush, and then eventually crashes (losing some or all ios).  If 0,
+        no bdev failure injection is enabled.
+
+    bdev_inject_crash_probability: (.5) probability of doing a bdev failure
+        injection crash vs a normal OSD kill.
+
+    chance_test_backfill_full: (0) chance to simulate full disks stopping
+        backfill
+
+    chance_test_map_discontinuity: (0) chance to test map discontinuity
+    map_discontinuity_sleep_time: (40) time to wait for map trims
+
+    ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
+    chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)
+
+    optrack_toggle_delay: (2.0) duration to delay between toggling op tracker
+                  enablement to all osds
+
+    dump_ops_enable: (true) continuously dump ops on all live osds
+
+    noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub
+
+    disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool based
+                                    tests
+
+    chance_thrash_cluster_full: .05
+
+    chance_thrash_pg_upmap: 1.0
+    chance_thrash_pg_upmap_items: 1.0
+
+    aggressive_pg_num_changes: (true)  whether we should bypass the careful throttling of pg_num and pgp_num changes in mgr's adjust_pgs() controller
+
+    example:
+
+    tasks:
+    - ceph:
+    - thrashosds:
+        cluster: ceph
+        chance_down: 10
+        op_delay: 3
+        min_in: 1
+        timeout: 600
+    - interactive:
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'thrashosds task only accepts a dict for configuration'
+    # add default value for sighup_delay
+    config['sighup_delay'] = config.get('sighup_delay', 0.1)
+    # add default value for optrack_toggle_delay
+    config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
+    # add default value for dump_ops_enable
+    config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
+    # add default value for noscrub_toggle_delay
+    config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)
+    # add default value for random_eio
+    config['random_eio'] = config.get('random_eio', 0.0)
+    aggro = config.get('aggressive_pg_num_changes', True)
+
+    log.info("config is {config}".format(config=str(config)))
+
+    overrides = ctx.config.get('overrides', {})
+    log.info("overrides is {overrides}".format(overrides=str(overrides)))
+    teuthology.deep_merge(config, overrides.get('thrashosds', {}))
+    cluster = config.get('cluster', 'ceph')
+
+    log.info("config is {config}".format(config=str(config)))
+
+    if 'powercycle' in config:
+
+        # sync everyone first to avoid collateral damage to / etc.
+        log.info('Doing preliminary sync to avoid collateral damage...')
+        ctx.cluster.run(args=['sync'])
+
+        if 'ipmi_user' in ctx.teuthology_config:
+            for remote in ctx.cluster.remotes.keys():
+                log.debug('checking console status of %s' % remote.shortname)
+                if not remote.console.check_status():
+                    log.warning('Failed to get console status for %s',
+                             remote.shortname)
+
+            # check that all osd remotes have a valid console
+            osds = ctx.cluster.only(teuthology.is_type('osd', cluster))
+            for remote in osds.remotes.keys():
+                if not remote.console.has_ipmi_credentials:
+                    raise Exception(
+                        'IPMI console required for powercycling, '
+                        'but not available on osd role: {r}'.format(
+                            r=remote.name))
+
+    cluster_manager = ctx.managers[cluster]
+    for f in ['powercycle', 'bdev_inject_crash']:
+        if config.get(f):
+            cluster_manager.config[f] = config.get(f)
+
+    if aggro:
+        cluster_manager.raw_cluster_cmd(
+            'config', 'set', 'mgr',
+            'mgr_debug_aggressive_pg_num_changes',
+            'true')
+
+    log.info('Beginning thrashosds...')
+    thrash_proc = ceph_manager.OSDThrasher(
+        cluster_manager,
+        config,
+        "OSDThrasher",
+        logger=log.getChild('thrasher')
+        )
+    ctx.ceph[cluster].thrashers.append(thrash_proc)
+    try:
+        yield
+    finally:
+        log.info('joining thrashosds')
+        thrash_proc.do_join()
+        cluster_manager.wait_for_all_osds_up()
+        cluster_manager.flush_all_pg_stats()
+        cluster_manager.wait_for_recovery(config.get('timeout', 360))
+        if aggro:
+            cluster_manager.raw_cluster_cmd(
+                'config', 'rm', 'mgr',
+                'mgr_debug_aggressive_pg_num_changes')
diff --git a/qa/tasks/tox.py b/qa/tasks/tox.py
new file mode 100644
index 000000000..61c5b7411
--- /dev/null
+++ b/qa/tasks/tox.py
@@ -0,0 +1,50 @@
+import argparse
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+def get_toxvenv_dir(ctx):
+    return '{tdir}/tox-venv'.format(tdir=teuthology.get_testdir(ctx))
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy tox from pip. It's a dependency for both Keystone and Tempest.
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task tox only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    log.info('Deploying tox from pip...')
+    for (client, _) in config.items():
+        # yup, we have to deploy tox first. The packaged one, available
+        # on Sepia's Ubuntu machines, is outdated for Keystone/Tempest.
+        tvdir = get_toxvenv_dir(ctx)
+        ctx.cluster.only(client).run(args=['python3', '-m', 'venv', tvdir])
+        ctx.cluster.only(client).run(args=[
+            'source', '{tvdir}/bin/activate'.format(tvdir=tvdir),
+            run.Raw('&&'),
+            'pip', 'install', 'tox==3.15.0'
+        ])
+
+    # export the path Keystone and Tempest
+    ctx.tox = argparse.Namespace()
+    ctx.tox.venv_path = get_toxvenv_dir(ctx)
+
+    try:
+        yield
+    finally:
+        for (client, _) in config.items():
+            ctx.cluster.only(client).run(
+                args=[ 'rm', '-rf', get_toxvenv_dir(ctx) ])
diff --git a/qa/tasks/userdata_setup.yaml b/qa/tasks/userdata_setup.yaml
new file mode 100644
index 000000000..afcc08e22
--- /dev/null
+++ b/qa/tasks/userdata_setup.yaml
@@ -0,0 +1,36 @@
+#cloud-config-archive
+
+- type: text/cloud-config
+  content: |
+    output:
+      all: '| tee -a /var/log/cloud-init-output.log'
+
+# allow passwordless access for debugging
+- |
+  #!/usr/bin/env bash
+  exec passwd -d ubuntu
+
+- |
+  #!/usr/bin/env bash
+
+  # mount a NFS share for storing logs
+  sed -i 's/archive.ubuntu.com/old-releases.ubuntu.com/' /etc/apt/sources.list
+  sed -i 's/security.ubuntu.com/old-releases.ubuntu.com/' /etc/apt/sources.list
+  apt-get update
+
+  # DST Root CA X3 certificate expired on Sep 30, 2021.  It was used by
+  # Let's Encrypt, which is what git.ceph.com relies on for HTTPS.  Get the
+  # new Let's Encrypt root certificate in place and deactivate the old one
+  # (lines that begin with "!" are deselected).
+  apt-get install --only-upgrade ca-certificates libssl1.0.0
+  sed -i 's/mozilla\/DST_Root_CA_X3\.crt/!mozilla\/DST_Root_CA_X3\.crt/' /etc/ca-certificates.conf
+  update-ca-certificates
+
+  apt-get -y install nfs-common
+  mkdir /mnt/log
+  # 10.0.2.2 is the host
+  mount -v -t nfs -o proto=tcp 10.0.2.2:{mnt_dir} /mnt/log
+
+  # mount the iso image that has the test script
+  mkdir /mnt/cdrom
+  mount -t auto /dev/cdrom /mnt/cdrom
diff --git a/qa/tasks/userdata_teardown.yaml b/qa/tasks/userdata_teardown.yaml
new file mode 100644
index 000000000..731d769f0
--- /dev/null
+++ b/qa/tasks/userdata_teardown.yaml
@@ -0,0 +1,11 @@
+- |
+  #!/usr/bin/env bash
+  cp /var/log/cloud-init-output.log /mnt/log
+
+- |
+  #!/usr/bin/env bash
+  umount /mnt/log
+
+- |
+  #!/usr/bin/env bash
+  shutdown -h -P now
diff --git a/qa/tasks/util/__init__.py b/qa/tasks/util/__init__.py
new file mode 100644
index 000000000..5b8575ed9
--- /dev/null
+++ b/qa/tasks/util/__init__.py
@@ -0,0 +1,26 @@
+from teuthology import misc
+
+def get_remote(ctx, cluster, service_type, service_id):
+    """
+    Get the Remote for the host where a particular role runs.
+
+    :param cluster: name of the cluster the service is part of
+    :param service_type: e.g. 'mds', 'osd', 'client'
+    :param service_id: The third part of a role, e.g. '0' for
+                       the role 'ceph.client.0'
+    :return: a Remote instance for the host where the
+             requested role is placed
+    """
+    def _is_instance(role):
+        role_tuple = misc.split_role(role)
+        return role_tuple == (cluster, service_type, str(service_id))
+    try:
+        (remote,) = ctx.cluster.only(_is_instance).remotes.keys()
+    except ValueError:
+        raise KeyError("Service {0}.{1}.{2} not found".format(cluster,
+                                                              service_type,
+                                                              service_id))
+    return remote
+
+def get_remote_for_role(ctx, role):
+    return get_remote(ctx, *misc.split_role(role))
diff --git a/qa/tasks/util/rados.py b/qa/tasks/util/rados.py
new file mode 100644
index 000000000..a0c54ce4e
--- /dev/null
+++ b/qa/tasks/util/rados.py
@@ -0,0 +1,87 @@
+import logging
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def rados(ctx, remote, cmd, wait=True, check_status=False):
+    testdir = teuthology.get_testdir(ctx)
+    log.info("rados %s" % ' '.join(cmd))
+    pre = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'rados',
+        ];
+    pre.extend(cmd)
+    proc = remote.run(
+        args=pre,
+        check_status=check_status,
+        wait=wait,
+        )
+    if wait:
+        return proc.exitstatus
+    else:
+        return proc
+
+def create_ec_pool(remote, name, profile_name, pgnum, profile={}, cluster_name="ceph", application=None):
+    remote.run(args=['sudo', 'ceph'] +
+               cmd_erasure_code_profile(profile_name, profile) + ['--cluster', cluster_name])
+    remote.run(args=[
+        'sudo', 'ceph', 'osd', 'pool', 'create', name,
+        str(pgnum), str(pgnum), 'erasure', profile_name, '--cluster', cluster_name
+        ])
+    if application:
+        remote.run(args=[
+            'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
+        ], check_status=False) # may fail as EINVAL when run in jewel upgrade test
+
+def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application=None):
+    remote.run(args=[
+        'sudo', 'ceph', 'osd', 'pool', 'create', name, str(pgnum), str(pgnum), '--cluster', cluster_name
+        ])
+    if application:
+        remote.run(args=[
+            'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
+        ], check_status=False)
+
+def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"):
+    remote.run(args=[
+        'sudo', 'ceph', 'osd', 'pool', 'create', cache_name, str(pgnum), '--cluster', cluster_name
+    ])
+    remote.run(args=[
+        'sudo', 'ceph', 'osd', 'tier', 'add-cache', base_name, cache_name,
+        str(size), '--cluster', cluster_name
+    ])
+
+def cmd_erasure_code_profile(profile_name, profile):
+    """
+    Return the shell command to run to create the erasure code profile
+    described by the profile parameter.
+    
+    :param profile_name: a string matching [A-Za-z0-9-_.]+
+    :param profile: a map whose semantic depends on the erasure code plugin
+    :returns: a shell command as an array suitable for Remote.run
+
+    If profile is {}, it is replaced with 
+
+      { 'k': '2', 'm': '1', 'crush-failure-domain': 'osd'}
+
+    for backward compatibility. In previous versions of teuthology,
+    these values were hardcoded as function arguments and some yaml
+    files were designed with these implicit values. The teuthology
+    code should not know anything about the erasure code profile
+    content or semantic. The valid values and parameters are outside
+    its scope.
+    """
+
+    if profile == {}:
+        profile = {
+            'k': '2',
+            'm': '1',
+            'crush-failure-domain': 'osd'
+        }
+    return [
+        'osd', 'erasure-code-profile', 'set',
+        profile_name
+        ] + [ str(key) + '=' + str(value) for key, value in profile.items() ]
diff --git a/qa/tasks/util/rgw.py b/qa/tasks/util/rgw.py
new file mode 100644
index 000000000..c955f3150
--- /dev/null
+++ b/qa/tasks/util/rgw.py
@@ -0,0 +1,94 @@
+import logging
+import json
+import time
+
+from io import StringIO
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def rgwadmin(ctx, client, cmd, stdin=StringIO(), check_status=False,
+             format='json', decode=True, log_level=logging.DEBUG):
+    log.info('rgwadmin: {client} : {cmd}'.format(client=client,cmd=cmd))
+    testdir = teuthology.get_testdir(ctx)
+    cluster_name, daemon_type, client_id = teuthology.split_role(client)
+    client_with_id = daemon_type + '.' + client_id
+    pre = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'radosgw-admin',
+        '--log-to-stderr',
+        '--format', format,
+        '-n',  client_with_id,
+        '--cluster', cluster_name,
+        ]
+    pre.extend(cmd)
+    log.log(log_level, 'rgwadmin: cmd=%s' % pre)
+    (remote,) = ctx.cluster.only(client).remotes.keys()
+    proc = remote.run(
+        args=pre,
+        check_status=check_status,
+        stdout=StringIO(),
+        stderr=StringIO(),
+        stdin=stdin,
+        )
+    r = proc.exitstatus
+    out = proc.stdout.getvalue()
+    if not decode:
+        return (r, out)
+    j = None
+    if not r and out != '':
+        try:
+            j = json.loads(out)
+            log.log(log_level, ' json result: %s' % j)
+        except ValueError:
+            j = out
+            log.log(log_level, ' raw result: %s' % j)
+    return (r, j)
+
+def get_user_summary(out, user):
+    """Extract the summary for a given user"""
+    user_summary = None
+    for summary in out['summary']:
+        if summary.get('user') == user:
+            user_summary = summary
+
+    if not user_summary:
+        raise AssertionError('No summary info found for user: %s' % user)
+
+    return user_summary
+
+def get_user_successful_ops(out, user):
+    summary = out['summary']
+    if len(summary) == 0:
+        return 0
+    return get_user_summary(out, user)['total']['successful_ops']
+
+def wait_for_radosgw(url, remote):
+    """ poll the given url until it starts accepting connections
+
+    add_daemon() doesn't wait until radosgw finishes startup, so this is used
+    to avoid racing with later tasks that expect radosgw to be up and listening
+    """
+    # TODO: use '--retry-connrefused --retry 8' when teuthology is running on
+    # Centos 8 and other OS's with an updated version of curl
+    curl_cmd = ['curl',
+                url]
+    exit_status = 0
+    num_retries = 8
+    for seconds in range(num_retries):
+        proc = remote.run(
+            args=curl_cmd,
+            check_status=False,
+            stdout=StringIO(),
+            stderr=StringIO(),
+            stdin=StringIO(),
+            )
+        exit_status = proc.exitstatus
+        if exit_status == 0:
+            break
+        time.sleep(2**seconds)
+
+    assert exit_status == 0
diff --git a/qa/tasks/util/test/__init__.py b/qa/tasks/util/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/tasks/util/test/__init__.py
diff --git a/qa/tasks/util/test/test_rados.py b/qa/tasks/util/test/test_rados.py
new file mode 100644
index 000000000..a8f4cb02d
--- /dev/null
+++ b/qa/tasks/util/test/test_rados.py
@@ -0,0 +1,40 @@
+#
+#  The MIT License
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+#  Permission is hereby granted, free of charge, to any person
+#  obtaining a copy of this software and associated documentation
+#  files (the "Software"), to deal in the Software without
+#  restriction, including without limitation the rights to use,
+#  copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following
+#  conditions:
+#
+#  The above copyright notice and this permission notice shall be
+#  included in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+#  OTHER DEALINGS IN THE SOFTWARE.
+#
+from tasks.util import rados
+
+class TestRados(object):
+
+    def test_cmd_erasure_code_profile(self):
+        name = 'NAME'
+        cmd = rados.cmd_erasure_code_profile(name, {})
+        assert 'k=2' in cmd
+        assert name in cmd
+        cmd = rados.cmd_erasure_code_profile(name, { 'k': '88' })
+        assert 'k=88' in cmd
+        assert name in cmd
diff --git a/qa/tasks/util/workunit.py b/qa/tasks/util/workunit.py
new file mode 100644
index 000000000..1f5623af8
--- /dev/null
+++ b/qa/tasks/util/workunit.py
@@ -0,0 +1,78 @@
+import copy
+
+from teuthology import misc
+from teuthology.orchestra import run
+
+class Refspec:
+    def __init__(self, refspec):
+        self.refspec = refspec
+
+    def __str__(self):
+        return self.refspec
+
+    def _clone(self, git_url, clonedir, opts=None):
+        if opts is None:
+            opts = []
+        return (['rm', '-rf', clonedir] +
+                [run.Raw('&&')] +
+                ['git', 'clone'] + opts +
+                [git_url, clonedir])
+
+    def _cd(self, clonedir):
+        return ['cd', clonedir]
+
+    def _checkout(self):
+        return ['git', 'checkout', self.refspec]
+
+    def clone(self, git_url, clonedir):
+        return (self._clone(git_url, clonedir) +
+                [run.Raw('&&')] +
+                self._cd(clonedir) +
+                [run.Raw('&&')] +
+                self._checkout())
+
+
+class Branch(Refspec):
+    def __init__(self, tag):
+        Refspec.__init__(self, tag)
+
+    def clone(self, git_url, clonedir):
+        opts = ['--depth', '1',
+                '--branch', self.refspec]
+        return (self._clone(git_url, clonedir, opts) +
+                [run.Raw('&&')] +
+                self._cd(clonedir))
+
+
+class Head(Refspec):
+    def __init__(self):
+        Refspec.__init__(self, 'HEAD')
+
+    def clone(self, git_url, clonedir):
+        opts = ['--depth', '1']
+        return (self._clone(git_url, clonedir, opts) +
+                [run.Raw('&&')] +
+                self._cd(clonedir))
+
+
+def get_refspec_after_overrides(config, overrides):
+    # mimic the behavior of the "install" task, where the "overrides" are
+    # actually the defaults of that task. in other words, if none of "sha1",
+    # "tag", or "branch" is specified by a "workunit" tasks, we will update
+    # it with the information in the "workunit" sub-task nested in "overrides".
+    overrides = copy.deepcopy(overrides.get('workunit', {}))
+    refspecs = {'suite_sha1': Refspec, 'suite_branch': Branch,
+                'sha1': Refspec, 'tag': Refspec, 'branch': Branch}
+    if any(map(lambda i: i in config, refspecs.keys())):
+        for i in refspecs.keys():
+            overrides.pop(i, None)
+    misc.deep_merge(config, overrides)
+
+    for spec, cls in refspecs.items():
+        refspec = config.get(spec)
+        if refspec:
+            refspec = cls(refspec)
+            break
+    if refspec is None:
+        refspec = Head()
+    return refspec
diff --git a/qa/tasks/vault.py b/qa/tasks/vault.py
new file mode 100644
index 000000000..2ff008c4d
--- /dev/null
+++ b/qa/tasks/vault.py
@@ -0,0 +1,288 @@
+"""
+Deploy and configure Vault for Teuthology
+"""
+
+import argparse
+import contextlib
+import logging
+import time
+import json
+from os import path
+from http import client as http_client
+from urllib.parse import urljoin
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.exceptions import ConfigError, CommandFailedError
+
+
+log = logging.getLogger(__name__)
+
+
+def assign_ports(ctx, config, initial_port):
+    """
+    Assign port numbers starting from @initial_port
+    """
+    port = initial_port
+    role_endpoints = {}
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for role in roles_for_host:
+            if role in config:
+                role_endpoints[role] = (remote.name.split('@')[1], port)
+                port += 1
+
+    return role_endpoints
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download Vault Release from Hashicopr website.
+    Remove downloaded file upon exit.
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading Vault...')
+    testdir = teuthology.get_testdir(ctx)
+
+    for (client, cconf) in config.items():
+        install_url = cconf.get('install_url')
+        install_sha256 = cconf.get('install_sha256')
+        if not install_url or not install_sha256:
+            raise ConfigError("Missing Vault install_url and/or install_sha256")
+        install_zip = path.join(testdir, 'vault.zip')
+        install_dir = path.join(testdir, 'vault')
+
+        log.info('Downloading Vault...')
+        ctx.cluster.only(client).run(
+            args=['curl', '-L', install_url, '-o', install_zip])
+
+        log.info('Verifying SHA256 signature...')
+        ctx.cluster.only(client).run(
+            args=['echo', ' '.join([install_sha256, install_zip]), run.Raw('|'),
+                  'sha256sum', '--check', '--status'])
+
+        log.info('Extracting vault...')
+        ctx.cluster.only(client).run(args=['mkdir', '-p', install_dir])
+        # Using python in case unzip is not installed on hosts
+        # Using python3 in case python is not installed on hosts
+        failed=True
+        for f in [
+                lambda z,d: ['unzip', z, '-d', d],
+                lambda z,d: ['python3', '-m', 'zipfile', '-e', z, d],
+                lambda z,d: ['python', '-m', 'zipfile', '-e', z, d]]:
+            try:
+                ctx.cluster.only(client).run(args=f(install_zip, install_dir))
+                failed = False
+                break
+            except CommandFailedError as e:
+                failed = e
+        if failed:
+            raise failed
+
+    try:
+        yield
+    finally:
+        log.info('Removing Vault...')
+        testdir = teuthology.get_testdir(ctx)
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=['rm', '-rf', install_dir, install_zip])
+
+
+def get_vault_dir(ctx):
+    return '{tdir}/vault'.format(tdir=teuthology.get_testdir(ctx))
+
+
+@contextlib.contextmanager
+def run_vault(ctx, config):
+    assert isinstance(config, dict)
+
+    for (client, cconf) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        cluster_name, _, client_id = teuthology.split_role(client)
+
+        _, port = ctx.vault.endpoints[client]
+        listen_addr = "0.0.0.0:{}".format(port)
+
+        root_token = ctx.vault.root_token = cconf.get('root_token', 'root')
+
+        log.info("Starting Vault listening on %s ...", listen_addr)
+        v_params = [
+            '-dev',
+            '-dev-listen-address={}'.format(listen_addr),
+            '-dev-no-store-token',
+            '-dev-root-token-id={}'.format(root_token)
+        ]
+
+        cmd = "chmod +x {vdir}/vault && {vdir}/vault server {vargs}".format(vdir=get_vault_dir(ctx), vargs=" ".join(v_params))
+
+        ctx.daemons.add_daemon(
+            remote, 'vault', client_id,
+            cluster=cluster_name,
+            args=['bash', '-c', cmd, run.Raw('& { read; kill %1; }')],
+            logger=log.getChild(client),
+            stdin=run.PIPE,
+            cwd=get_vault_dir(ctx),
+            wait=False,
+            check_status=False,
+        )
+        time.sleep(10)
+    try:
+        yield
+    finally:
+        log.info('Stopping Vault instance')
+        ctx.daemons.get_daemon('vault', client_id, cluster_name).stop()
+
+
+@contextlib.contextmanager
+def setup_vault(ctx, config):
+    """
+    Mount Transit or KV version 2 secrets engine
+    """
+    (cclient, cconfig) = next(iter(config.items()))
+    engine = cconfig.get('engine')
+
+    if engine == 'kv':
+        log.info('Mounting kv version 2 secrets engine')
+        mount_path = '/v1/sys/mounts/kv'
+        data = {
+            "type": "kv",
+            "options": {
+                "version": "2"
+            }
+        }
+    elif engine == 'transit':
+        log.info('Mounting transit secrets engine')
+        mount_path = '/v1/sys/mounts/transit'
+        data = {
+            "type": "transit"
+        }
+    else:
+        raise Exception("Unknown or missing secrets engine")
+
+    send_req(ctx, cconfig, cclient, mount_path, json.dumps(data))
+    yield
+
+
+def send_req(ctx, cconfig, client, path, body, method='POST'):
+    host, port = ctx.vault.endpoints[client]
+    req = http_client.HTTPConnection(host, port, timeout=30)
+    token = cconfig.get('root_token', 'atoken')
+    log.info("Send request to Vault: %s:%s at %s with token: %s", host, port, path, token)
+    headers = {'X-Vault-Token': token}
+    req.request(method, path, headers=headers, body=body)
+    resp = req.getresponse()
+    log.info(resp.read())
+    if not (resp.status >= 200 and resp.status < 300):
+        raise Exception("Request to Vault server failed with status %d" % resp.status)
+    return resp
+
+
+@contextlib.contextmanager
+def create_secrets(ctx, config):
+    (cclient, cconfig) = next(iter(config.items()))
+    engine = cconfig.get('engine')
+    prefix = cconfig.get('prefix')
+    secrets = cconfig.get('secrets')
+    flavor = cconfig.get('flavor')
+    if secrets is None:
+        raise ConfigError("No secrets specified, please specify some.")
+
+    ctx.vault.keys[cclient] = []
+    for secret in secrets:
+        try:
+            path = secret['path']
+        except KeyError:
+            raise ConfigError('Missing "path" field in secret')
+        exportable = secret.get("exportable", flavor == "old")
+
+        if engine == 'kv':
+            try:
+                data = {
+                    "data": {
+                        "key": secret['secret']
+                    }
+                }
+            except KeyError:
+                raise ConfigError('Missing "secret" field in secret')
+        elif engine == 'transit':
+            data = {"exportable": "true" if exportable else "false"}
+        else:
+            raise Exception("Unknown or missing secrets engine")
+
+        send_req(ctx, cconfig, cclient, urljoin(prefix, path), json.dumps(data))
+
+        ctx.vault.keys[cclient].append({ 'Path': path });
+
+    log.info("secrets created")
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy and configure Vault
+
+    Example of configuration:
+
+    tasks:
+    - vault:
+        client.0:
+          install_url: http://my.special.place/vault.zip
+          install_sha256: zipfiles-sha256-sum-much-larger-than-this
+          root_token: test_root_token
+          engine: transit
+          flavor: old
+          prefix: /v1/transit/keys
+          secrets:
+            - path: kv/teuthology/key_a
+              secret: base64_only_if_using_kv_aWxkCmNlcGguY29uZgo=
+              exportable: true
+            - path: kv/teuthology/key_b
+              secret: base64_only_if_using_kv_dApzcmMKVGVzdGluZwo=
+
+    engine can be 'kv' or 'transit'
+    prefix should be /v1/kv/data/ for kv, /v1/transit/keys/ for transit
+    flavor should be 'old' only if testing the original transit logic
+        otherwise omit.
+    for kv only: 256-bit key value should be specified via secret,
+        otherwise should omit.
+    for transit: exportable may be used to make individual keys exportable.
+    flavor may be set to 'old' to make all keys exportable by default,
+        which is required by the original transit logic.
+    """
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level.
+    for client in config.keys():
+        if not config[client]:
+            config[client] = {}
+        teuthology.deep_merge(config[client], overrides.get('vault', {}))
+
+    log.debug('Vault config is %s', config)
+
+    ctx.vault = argparse.Namespace()
+    ctx.vault.endpoints = assign_ports(ctx, config, 8200)
+    ctx.vault.root_token = None
+    ctx.vault.prefix = config[client].get('prefix')
+    ctx.vault.engine = config[client].get('engine')
+    ctx.vault.keys = {}
+    q=config[client].get('flavor')
+    if q:
+        ctx.vault.flavor = q
+
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: run_vault(ctx=ctx, config=config),
+        lambda: setup_vault(ctx=ctx, config=config),
+        lambda: create_secrets(ctx=ctx, config=config)
+        ):
+        yield
+
diff --git a/qa/tasks/vip.py b/qa/tasks/vip.py
new file mode 100644
index 000000000..52114b104
--- /dev/null
+++ b/qa/tasks/vip.py
@@ -0,0 +1,205 @@
+import contextlib
+import ipaddress
+import logging
+import re
+
+from teuthology import misc as teuthology
+from teuthology.config import config as teuth_config
+
+log = logging.getLogger(__name__)
+
+
+def subst_vip(ctx, cmd):
+    p = re.compile(r'({{VIP(\d+)}})')
+    for m in p.findall(cmd):
+        n = int(m[1])
+        if n >= len(ctx.vip["vips"]):
+            log.warning(f'no VIP{n} (we have {len(ctx.vip["vips"])})')
+        else:
+            cmd = cmd.replace(m[0], str(ctx.vip["vips"][n]))
+
+    if '{{VIPPREFIXLEN}}' in cmd:
+        cmd = cmd.replace('{{VIPPREFIXLEN}}', str(ctx.vip["vnet"].prefixlen))
+
+    if '{{VIPSUBNET}}' in cmd:
+        cmd = cmd.replace('{{VIPSUBNET}}', str(ctx.vip["vnet"].network_address))
+
+    return cmd
+
+
+def echo(ctx, config):
+    """
+    This is mostly for debugging
+    """
+    for remote in ctx.cluster.remotes.keys():
+        log.info(subst_vip(ctx, config))
+
+
+def exec(ctx, config):
+    """
+    This is similar to the standard 'exec' task, but does the VIP substitutions.
+    """
+    assert isinstance(config, dict), "task exec got invalid config"
+
+    testdir = teuthology.get_testdir(ctx)
+
+    if 'all-roles' in config and len(config) == 1:
+        a = config['all-roles']
+        roles = teuthology.all_roles(ctx.cluster)
+        config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
+    elif 'all-hosts' in config and len(config) == 1:
+        a = config['all-hosts']
+        roles = teuthology.all_roles(ctx.cluster)
+        config = dict((id_, a) for id_ in roles if id_.startswith('host.'))
+
+    for role, ls in config.items():
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        log.info('Running commands on role %s host %s', role, remote.name)
+        for c in ls:
+            c.replace('$TESTDIR', testdir)
+            remote.run(
+                args=[
+                    'sudo',
+                    'TESTDIR={tdir}'.format(tdir=testdir),
+                    'bash',
+                    '-ex',
+                    '-c',
+                    subst_vip(ctx, c)],
+                )
+
+
+def map_vips(mip, count):
+    for mapping in teuth_config.get('vip', []):
+        mnet = ipaddress.ip_network(mapping['machine_subnet'])
+        vnet = ipaddress.ip_network(mapping['virtual_subnet'])
+        if vnet.prefixlen >= mnet.prefixlen:
+            log.error(f"virtual_subnet {vnet} prefix >= machine_subnet {mnet} prefix")
+            return None
+        if mip in mnet:
+            pos = list(mnet.hosts()).index(mip)
+            log.info(f"{mip} in {mnet}, pos {pos}")
+            r = []
+            for sub in vnet.subnets(new_prefix=mnet.prefixlen):
+                r += [list(sub.hosts())[pos]]
+                count -= 1
+                if count == 0:
+                    break
+            return vnet, r
+    return None
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Set up a virtual network and allocate virtual IP(s) for each machine.
+
+    The strategy here is to set up a private virtual subnet that is larger than
+    the subnet the machine(s) exist in, and allocate virtual IPs from that pool.
+
+    - The teuthology.yaml must include a section like::
+
+        vip:
+          - machine_subnet: 172.21.0.0/20
+            virtual_subnet: 10.0.0.0/16
+
+      At least one item's machine_subnet should map the subnet the test machine's
+      primary IP lives in (the one DNS resolves to).  The virtual_subnet must have a
+      shorter prefix (i.e., larger than the machine_subnet).  If there are multiple
+      machine_subnets, they cannot map into the same virtual_subnet.
+
+    - Each machine gets an IP in the virtual_subset statically configured by the vip
+      task. This lets all test machines reach each other and (most importantly) any
+      virtual IPs.
+
+    - 1 or more virtual IPs are then mapped for the task.  These IPs are chosen based
+      on one of the remotes.  This uses a lot of network space but it avoids any
+      conflicts between tests.
+
+    To use a virtual IP, the {{VIP0}}, {{VIP1}}, etc. substitutions can be used.
+    
+    {{VIPSUBNET}} is the virtual_subnet address (10.0.0.0 in the example).
+
+    {{VIPPREFIXLEN}} is the virtual_subnet prefix (16 in the example.
+
+    These substitutions work for vip.echo, and (at the time of writing) cephadm.apply
+    and cephadm.shell.
+    """
+    if config is None:
+        config = {}
+    count = config.get('count', 1)
+
+    ctx.vip_static = {}
+    ctx.vip = {}
+
+    log.info("Allocating static IPs for each host...")
+    for remote in ctx.cluster.remotes.keys():
+        ip = remote.ssh.get_transport().getpeername()[0]
+        log.info(f'peername {ip}')
+        mip = ipaddress.ip_address(ip)
+        vnet, vips = map_vips(mip, count + 1)
+        static = vips.pop(0)
+        log.info(f"{remote.hostname} static {static}, vnet {vnet}")
+
+        if not ctx.vip:
+            # do this only once (use the first remote we see), since we only need 1
+            # set of virtual IPs, regardless of how many remotes we have.
+            log.info("VIPs are {map(str, vips)}")
+            ctx.vip = {
+                'vnet': vnet,
+                'vips': vips,
+            }
+        else:
+            # all remotes must be in the same virtual network...
+            assert vnet == ctx.vip['vnet']
+
+        # pick interface
+        p = re.compile(r'^(\S+) dev (\S+) (.*)scope link (.*)src (\S+)')
+        iface = None
+        for line in remote.sh(['sudo', 'ip','route','ls']).splitlines():
+            m = p.findall(line)
+            if not m:
+                continue
+            route_iface = m[0][1]
+            route_ip = m[0][4]
+            if route_ip == ip:
+                iface = route_iface
+                break
+
+        if not iface:
+            log.error(f"Unable to find {remote.hostname} interface for {ip}")
+            continue
+
+        # configure
+        log.info(f"Configuring {static} on {remote.hostname} iface {iface}...")
+        remote.sh(['sudo',
+                   'ip', 'addr', 'add',
+                   str(static) + '/' + str(vnet.prefixlen),
+                   'dev', iface])
+
+        ctx.vip_static[remote] = {
+            "iface": iface,
+            "static": static,
+        }
+
+    try:
+        yield
+
+    finally:
+        for remote, m in ctx.vip_static.items():
+            log.info(f"Removing {m['static']} (and any VIPs) on {remote.hostname} iface {m['iface']}...")
+            remote.sh(['sudo',
+                       'ip', 'addr', 'del',
+                       str(m['static']) + '/' + str(ctx.vip['vnet'].prefixlen),
+                       'dev', m['iface']])
+
+            for vip in ctx.vip['vips']:
+                remote.sh(
+                    [
+                        'sudo',
+                        'ip', 'addr', 'del',
+                        str(vip) + '/' + str(ctx.vip['vnet'].prefixlen),
+                        'dev', m['iface']
+                    ],
+                    check_status=False,
+                )
+            
diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py
new file mode 100644
index 000000000..db0cb41cf
--- /dev/null
+++ b/qa/tasks/vstart_runner.py
@@ -0,0 +1,1769 @@
+"""
+vstart_runner: override Filesystem and Mount interfaces to run a CephFSTestCase against a vstart
+ceph instance instead of a packaged/installed cluster.  Use this to turn around test cases
+quickly during development.
+
+Simple usage (assuming teuthology and ceph checked out in ~/git):
+
+    # Activate the teuthology virtualenv
+    source ~/git/teuthology/virtualenv/bin/activate
+    # Go into your ceph build directory
+    cd ~/git/ceph/build
+    # Invoke a test using this script
+    python ~/git/ceph/qa/tasks/vstart_runner.py --create tasks.cephfs.test_data_scan
+
+Alternative usage:
+
+    # Alternatively, if you use different paths, specify them as follows:
+    LD_LIBRARY_PATH=`pwd`/lib PYTHONPATH=~/git/teuthology:~/git/ceph/qa:`pwd`/../src/pybind:`pwd`/lib/cython_modules/lib.3 python ~/git/ceph/qa/tasks/vstart_runner.py
+
+    # If you wish to drop to a python shell on failures, use --interactive:
+    python ~/git/ceph/qa/tasks/vstart_runner.py --interactive
+
+    # If you wish to run a named test case, pass it as an argument:
+    python ~/git/ceph/qa/tasks/vstart_runner.py tasks.cephfs.test_data_scan
+
+    # Also, you can create the cluster once and then run named test cases against it:
+    python ~/git/ceph/qa/tasks/vstart_runner.py --create-cluster-only
+    python ~/git/ceph/qa/tasks/vstart_runner.py tasks.mgr.dashboard.test_health
+    python ~/git/ceph/qa/tasks/vstart_runner.py tasks.mgr.dashboard.test_rgw
+
+"""
+
+from io import StringIO
+from collections import defaultdict
+import getpass
+import signal
+import tempfile
+import threading
+import datetime
+import shutil
+import re
+import os
+import time
+import sys
+import errno
+from IPy import IP
+import unittest
+import platform
+import logging
+
+from unittest import suite, loader
+
+from teuthology.orchestra.run import Raw, quote
+from teuthology.orchestra.daemon import DaemonGroup
+from teuthology.orchestra.remote import Remote
+from teuthology.config import config as teuth_config
+from teuthology.contextutil import safe_while
+from teuthology.contextutil import MaxWhileTries
+from teuthology.orchestra.run import CommandFailedError
+try:
+    import urllib3
+    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+except:
+    pass
+
+def init_log():
+    global log
+    if log is not None:
+        del log
+    log = logging.getLogger(__name__)
+
+    global logpath
+    logpath = './vstart_runner.log'
+
+    handler = logging.FileHandler(logpath)
+    formatter = logging.Formatter(
+        fmt=u'%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s',
+        datefmt='%Y-%m-%dT%H:%M:%S')
+    handler.setFormatter(formatter)
+    log.addHandler(handler)
+    log.setLevel(logging.INFO)
+
+log = None
+init_log()
+
+
+def respawn_in_path(lib_path, python_paths):
+    execv_cmd = ['python']
+    if platform.system() == "Darwin":
+        lib_path_var = "DYLD_LIBRARY_PATH"
+    else:
+        lib_path_var = "LD_LIBRARY_PATH"
+
+    py_binary = os.environ.get("PYTHON", sys.executable)
+
+    if lib_path_var in os.environ:
+        if lib_path not in os.environ[lib_path_var]:
+            os.environ[lib_path_var] += ':' + lib_path
+            os.execvp(py_binary, execv_cmd + sys.argv)
+    else:
+        os.environ[lib_path_var] = lib_path
+        os.execvp(py_binary, execv_cmd + sys.argv)
+
+    for p in python_paths:
+        sys.path.insert(0, p)
+
+
+# Let's use some sensible defaults
+if os.path.exists("./CMakeCache.txt") and os.path.exists("./bin"):
+
+    # A list of candidate paths for each package we need
+    guesses = [
+        ["~/git/teuthology", "~/scm/teuthology", "~/teuthology"],
+        ["lib/cython_modules/lib.3"],
+        ["../src/pybind"],
+    ]
+
+    python_paths = []
+
+    # Up one level so that "tasks.foo.bar" imports work
+    python_paths.append(os.path.abspath(
+        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
+    ))
+
+    for package_guesses in guesses:
+        for g in package_guesses:
+            g_exp = os.path.abspath(os.path.expanduser(g))
+            if os.path.exists(g_exp):
+                python_paths.append(g_exp)
+
+    ld_path = os.path.join(os.getcwd(), "lib/")
+    print("Using guessed paths {0} {1}".format(ld_path, python_paths))
+    respawn_in_path(ld_path, python_paths)
+
+
+try:
+    from tasks.ceph_manager import CephManager
+    from tasks.cephfs.fuse_mount import FuseMount
+    from tasks.cephfs.kernel_mount import KernelMount
+    from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
+    from tasks.cephfs.mount import CephFSMount
+    from tasks.mgr.mgr_test_case import MgrCluster
+    from teuthology.task import interactive
+except ImportError:
+    sys.stderr.write("***\nError importing packages, have you activated your teuthology virtualenv "
+                     "and set PYTHONPATH to point to teuthology and ceph-qa-suite?\n***\n\n")
+    raise
+
+# Must import after teuthology because of gevent monkey patching
+import subprocess
+
+if os.path.exists("./CMakeCache.txt"):
+    # Running in build dir of a cmake build
+    BIN_PREFIX = "./bin/"
+    SRC_PREFIX = "../src"
+else:
+    # Running in src/ of an autotools build
+    BIN_PREFIX = "./"
+    SRC_PREFIX = "./"
+
+
+def rm_nonascii_chars(var):
+    var = var.replace(b'\xe2\x80\x98', b'\'')
+    var = var.replace(b'\xe2\x80\x99', b'\'')
+    return var
+
+class LocalRemoteProcess(object):
+    def __init__(self, args, subproc, check_status, stdout, stderr):
+        self.args = args
+        self.subproc = subproc
+        self.stdout = stdout
+        self.stderr = stderr
+        # this variable is meant for instance of this class named fuse_daemon.
+        # child process of the command launched with sudo must be killed,
+        # since killing parent process alone has no impact on the child
+        # process.
+        self.fuse_pid = -1
+
+        self.check_status = check_status
+        self.exitstatus = self.returncode = None
+
+    def wait(self):
+        if self.finished:
+            # Avoid calling communicate() on a dead process because it'll
+            # give you stick about std* already being closed
+            if self.check_status and self.exitstatus != 0:
+                raise CommandFailedError(self.args, self.exitstatus)
+            else:
+                return
+
+        out, err = self.subproc.communicate()
+        out, err = rm_nonascii_chars(out), rm_nonascii_chars(err)
+        if isinstance(self.stdout, StringIO):
+            self.stdout.write(out.decode(errors='ignore'))
+        elif self.stdout is None:
+            pass
+        else:
+            self.stdout.write(out)
+        if isinstance(self.stderr, StringIO):
+            self.stderr.write(err.decode(errors='ignore'))
+        elif self.stderr is None:
+            pass
+        else:
+            self.stderr.write(err)
+
+        self.exitstatus = self.returncode = self.subproc.returncode
+
+        if self.exitstatus != 0:
+            sys.stderr.write(out.decode())
+            sys.stderr.write(err.decode())
+
+        if self.check_status and self.exitstatus != 0:
+            raise CommandFailedError(self.args, self.exitstatus)
+
+    @property
+    def finished(self):
+        if self.exitstatus is not None:
+            return True
+
+        if self.subproc.poll() is not None:
+            out, err = self.subproc.communicate()
+            if isinstance(self.stdout, StringIO):
+                self.stdout.write(out.decode(errors='ignore'))
+            elif self.stdout is None:
+                pass
+            else:
+                self.stdout.write(out)
+            if isinstance(self.stderr, StringIO):
+                self.stderr.write(err.decode(errors='ignore'))
+            elif self.stderr is None:
+                pass
+            else:
+                self.stderr.write(err)
+            self.exitstatus = self.returncode = self.subproc.returncode
+            return True
+        else:
+            return False
+
+    def kill(self):
+        log.debug("kill ")
+        if self.subproc.pid and not self.finished:
+            log.debug("kill: killing pid {0} ({1})".format(
+                self.subproc.pid, self.args))
+            if self.fuse_pid != -1:
+                safe_kill(self.fuse_pid)
+            else:
+                safe_kill(self.subproc.pid)
+        else:
+            log.debug("kill: already terminated ({0})".format(self.args))
+
+    @property
+    def stdin(self):
+        class FakeStdIn(object):
+            def __init__(self, mount_daemon):
+                self.mount_daemon = mount_daemon
+
+            def close(self):
+                self.mount_daemon.kill()
+
+        return FakeStdIn(self)
+
+
+class LocalRemote(object):
+    """
+    Amusingly named class to present the teuthology RemoteProcess interface when we are really
+    running things locally for vstart
+
+    Run this inside your src/ dir!
+    """
+
+    os = Remote.os
+    arch = Remote.arch
+
+    def __init__(self):
+        self.name = "local"
+        self.hostname = "localhost"
+        self.user = getpass.getuser()
+
+    def get_file(self, path, sudo, dest_dir):
+        tmpfile = tempfile.NamedTemporaryFile(delete=False).name
+        shutil.copy(path, tmpfile)
+        return tmpfile
+
+    # XXX: This method ignores the error raised when src and dst are
+    # holding same path. For teuthology, same path still represents
+    # different locations as they lie on different machines.
+    def put_file(self, src, dst, sudo=False):
+        try:
+            shutil.copy(src, dst)
+        except shutil.SameFileError:
+            pass
+
+    # XXX: accepts only two arugments to maintain compatibility with
+    # teuthology's mkdtemp.
+    def mkdtemp(self, suffix='', parentdir=None):
+        from tempfile import mkdtemp
+
+        # XXX: prefix had to be set without that this method failed against
+        # Python2.7 -
+        # > /usr/lib64/python2.7/tempfile.py(337)mkdtemp()
+        # -> file = _os.path.join(dir, prefix + name + suffix)
+        # (Pdb) p prefix
+        # None
+        return mkdtemp(suffix=suffix, prefix='', dir=parentdir)
+
+    def mktemp(self, suffix=None, parentdir=None):
+        """
+        Make a remote temporary file
+
+        Returns: the path of the temp file created.
+        """
+        from tempfile import mktemp
+        return mktemp(suffix=suffix, dir=parentdir)
+
+    def write_file(self, path, data, sudo=False, mode=None, owner=None,
+                                     mkdir=False, append=False):
+        """
+        Write data to file
+
+        :param path:    file path on host
+        :param data:    str, binary or fileobj to be written
+        :param sudo:    use sudo to write file, defaults False
+        :param mode:    set file mode bits if provided
+        :param owner:   set file owner if provided
+        :param mkdir:   preliminary create the file directory, defaults False
+        :param append:  append data to the file, defaults False
+        """
+        dd = 'sudo dd' if sudo else 'dd'
+        args = dd + ' of=' + path
+        if append:
+            args += ' conv=notrunc oflag=append'
+        if mkdir:
+            mkdirp = 'sudo mkdir -p' if sudo else 'mkdir -p'
+            dirpath = os.path.dirname(path)
+            if dirpath:
+                args = mkdirp + ' ' + dirpath + '\n' + args
+        if mode:
+            chmod = 'sudo chmod' if sudo else 'chmod'
+            args += '\n' + chmod + ' ' + mode + ' ' + path
+        if owner:
+            chown = 'sudo chown' if sudo else 'chown'
+            args += '\n' + chown + ' ' + owner + ' ' + path
+        omit_sudo = False if sudo else True
+        self.run(args=args, stdin=data, omit_sudo=omit_sudo)
+
+    def sudo_write_file(self, path, data, **kwargs):
+        """
+        Write data to file with sudo, for more info see `write_file()`.
+        """
+        self.write_file(path, data, sudo=True, **kwargs)
+
+    def _perform_checks_and_return_list_of_args(self, args, omit_sudo):
+        # Since Python's shell simulation can only work when commands are
+        # provided as a list of argumensts...
+        if isinstance(args, str):
+            args = args.split()
+
+        # We'll let sudo be a part of command even omit flag says otherwise in
+        # cases of commands which can normally be ran only by root.
+        try:
+            if args[args.index('sudo') + 1] in ['-u', 'passwd', 'chown']:
+                omit_sudo = False
+        except ValueError:
+            pass
+
+        # Quotes wrapping a command argument don't work fine in Python's shell
+        # simulation if the arguments contains spaces too. E.g. '"ls"' is OK
+        # but "ls /" isn't.
+        errmsg = "Don't surround arguments commands by quotes if it " + \
+                 "contains spaces.\nargs - %s" % (args)
+        for arg in args:
+            if isinstance(arg, Raw):
+                continue
+
+            if arg and (arg[0] in ['"', "'"] or arg[-1] in ['"', "'"]) and \
+               (arg.find(' ') != -1 and 0 < arg.find(' ') < len(arg) - 1):
+                raise RuntimeError(errmsg)
+
+        # ['sudo', '-u', 'user', '-s', 'path-to-shell', '-c', 'ls', 'a']
+        # and ['sudo', '-u', user, '-s', path_to_shell, '-c', 'ls a'] are
+        # treated differently by Python's shell simulation. Only latter has
+        # the desired effect.
+        errmsg = 'The entire command to executed as other user should be a ' +\
+                 'single argument.\nargs - %s' % (args)
+        if 'sudo' in args and '-u' in args and '-c' in args and \
+           args.count('-c') == 1:
+            if args.index('-c') != len(args) - 2 and \
+               args[args.index('-c') + 2].find('-') == -1:
+                raise RuntimeError(errmsg)
+
+        if omit_sudo:
+            args = [a for a in args if a != "sudo"]
+
+        return args
+
+    # Wrapper to keep the interface exactly same as that of
+    # teuthology.remote.run.
+    def run(self, **kwargs):
+        return self._do_run(**kwargs)
+
+    # XXX: omit_sudo is set to True since using sudo can change the ownership
+    # of files which becomes problematic for following executions of
+    # vstart_runner.py.
+    def _do_run(self, args, check_status=True, wait=True, stdout=None,
+                stderr=None, cwd=None, stdin=None, logger=None, label=None,
+                env=None, timeout=None, omit_sudo=True):
+        args = self._perform_checks_and_return_list_of_args(args, omit_sudo)
+
+        # We have to use shell=True if any run.Raw was present, e.g. &&
+        shell = any([a for a in args if isinstance(a, Raw)])
+
+        # Filter out helper tools that don't exist in a vstart environment
+        args = [a for a in args if a not in ('adjust-ulimits',
+                                             'ceph-coverage')]
+
+        # Adjust binary path prefix if given a bare program name
+        if not isinstance(args[0], Raw) and "/" not in args[0]:
+            # If they asked for a bare binary name, and it exists
+            # in our built tree, use the one there.
+            local_bin = os.path.join(BIN_PREFIX, args[0])
+            if os.path.exists(local_bin):
+                args = [local_bin] + args[1:]
+            else:
+                log.debug("'{0}' is not a binary in the Ceph build dir".format(
+                    args[0]
+                ))
+
+        log.debug('> ' +
+                 ' '.join([str(a.value) if isinstance(a, Raw) else a for a in args]))
+
+        if shell:
+            subproc = subprocess.Popen(quote(args),
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       stdin=subprocess.PIPE,
+                                       cwd=cwd,
+                                       env=env,
+                                       shell=True)
+        else:
+            # Sanity check that we've got a list of strings
+            for arg in args:
+                if not isinstance(arg, str):
+                    raise RuntimeError("Oops, can't handle arg {0} type {1}".format(
+                        arg, arg.__class__
+                    ))
+
+            subproc = subprocess.Popen(args,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       stdin=subprocess.PIPE,
+                                       cwd=cwd,
+                                       env=env)
+
+        if stdin:
+            # Hack: writing to stdin is not deadlock-safe, but it "always" works
+            # as long as the input buffer is "small"
+            if isinstance(stdin, str):
+                subproc.stdin.write(stdin.encode())
+            else:
+                subproc.stdin.write(stdin)
+
+        proc = LocalRemoteProcess(
+            args, subproc, check_status,
+            stdout, stderr
+        )
+
+        if wait:
+            proc.wait()
+
+        return proc
+
+    # XXX: for compatibility keep this method same as teuthology.orchestra.remote.sh
+    # BytesIO is being used just to keep things identical
+    def sh(self, script, **kwargs):
+        """
+        Shortcut for run method.
+
+        Usage:
+            my_name = remote.sh('whoami')
+            remote_date = remote.sh('date')
+        """
+        from io import BytesIO
+
+        if 'stdout' not in kwargs:
+            kwargs['stdout'] = BytesIO()
+        if 'args' not in kwargs:
+            kwargs['args'] = script
+        proc = self.run(**kwargs)
+        out = proc.stdout.getvalue()
+        if isinstance(out, bytes):
+            return out.decode()
+        else:
+            return out
+
+class LocalDaemon(object):
+    def __init__(self, daemon_type, daemon_id):
+        self.daemon_type = daemon_type
+        self.daemon_id = daemon_id
+        self.controller = LocalRemote()
+        self.proc = None
+
+    @property
+    def remote(self):
+        return LocalRemote()
+
+    def running(self):
+        return self._get_pid() is not None
+
+    def check_status(self):
+        if self.proc:
+            return self.proc.poll()
+
+    def _get_pid(self):
+        """
+        Return PID as an integer or None if not found
+        """
+        ps_txt = self.controller.run(args=["ps", "ww", "-u"+str(os.getuid())],
+                                     stdout=StringIO()).\
+            stdout.getvalue().strip()
+        lines = ps_txt.split("\n")[1:]
+
+        for line in lines:
+            if line.find("ceph-{0} -i {1}".format(self.daemon_type, self.daemon_id)) != -1:
+                log.debug("Found ps line for daemon: {0}".format(line))
+                return int(line.split()[0])
+        if opt_log_ps_output:
+            log.debug("No match for {0} {1}: {2}".format(
+                self.daemon_type, self.daemon_id, ps_txt))
+        else:
+            log.debug("No match for {0} {1}".format(self.daemon_type,
+                     self.daemon_id))
+            return None
+
+    def wait(self, timeout):
+        waited = 0
+        while self._get_pid() is not None:
+            if waited > timeout:
+                raise MaxWhileTries("Timed out waiting for daemon {0}.{1}".format(self.daemon_type, self.daemon_id))
+            time.sleep(1)
+            waited += 1
+
+    def stop(self, timeout=300):
+        if not self.running():
+            log.error('tried to stop a non-running daemon')
+            return
+
+        pid = self._get_pid()
+        log.debug("Killing PID {0} for {1}.{2}".format(pid, self.daemon_type, self.daemon_id))
+        os.kill(pid, signal.SIGTERM)
+
+        waited = 0
+        while pid is not None:
+            new_pid = self._get_pid()
+            if new_pid is not None and new_pid != pid:
+                log.debug("Killing new PID {0}".format(new_pid))
+                pid = new_pid
+                os.kill(pid, signal.SIGTERM)
+
+            if new_pid is None:
+                break
+            else:
+                if waited > timeout:
+                    raise MaxWhileTries(
+                        "Timed out waiting for daemon {0}.{1}".format(
+                            self.daemon_type, self.daemon_id))
+                time.sleep(1)
+                waited += 1
+
+        self.wait(timeout=timeout)
+
+    def restart(self):
+        if self._get_pid() is not None:
+            self.stop()
+
+        self.proc = self.controller.run(args=[
+            os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)),
+            "-i", self.daemon_id])
+
+    def signal(self, sig, silent=False):
+        if not self.running():
+            raise RuntimeError("Can't send signal to non-running daemon")
+
+        os.kill(self._get_pid(), sig)
+        if not silent:
+            log.debug("Sent signal {0} to {1}.{2}".format(sig, self.daemon_type, self.daemon_id))
+
+
+def safe_kill(pid):
+    """
+    os.kill annoyingly raises exception if process already dead.  Ignore it.
+    """
+    try:
+        return os.kill(pid, signal.SIGKILL)
+    except OSError as e:
+        if e.errno == errno.ESRCH:
+            # Raced with process termination
+            pass
+        else:
+            raise
+
+def mon_in_localhost(config_path="./ceph.conf"):
+    """
+    If the ceph cluster is using the localhost IP as mon host, will must disable ns unsharing
+    """
+    with open(config_path) as f:
+        for line in f:
+            local = re.match(r'^\s*mon host\s*=\s*\[((v1|v2):127\.0\.0\.1:\d+,?)+\]', line)
+            if local:
+                return True
+    return False
+
+class LocalKernelMount(KernelMount):
+    def __init__(self, ctx, test_dir, client_id=None,
+                 client_keyring_path=None, client_remote=None,
+                 hostfs_mntpt=None, cephfs_name=None, cephfs_mntpt=None,
+                 brxnet=None):
+        super(LocalKernelMount, self).__init__(ctx=ctx, test_dir=test_dir,
+            client_id=client_id, client_keyring_path=client_keyring_path,
+            client_remote=LocalRemote(), hostfs_mntpt=hostfs_mntpt,
+            cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet)
+
+    @property
+    def config_path(self):
+        return "./ceph.conf"
+
+    def get_keyring_path(self):
+        # This is going to end up in a config file, so use an absolute path
+        # to avoid assumptions about daemons' pwd
+        keyring_path = "./client.{0}.keyring".format(self.client_id)
+        try:
+            os.stat(keyring_path)
+        except OSError:
+            return os.path.join(os.getcwd(), 'keyring')
+        else:
+            return keyring_path
+
+    def setupfs(self, name=None):
+        if name is None and self.fs is not None:
+            # Previous mount existed, reuse the old name
+            name = self.fs.name
+        self.fs = LocalFilesystem(self.ctx, name=name)
+        log.debug('Wait for MDS to reach steady state...')
+        self.fs.wait_for_daemons()
+        log.debug('Ready to start {}...'.format(type(self).__name__))
+
+    @property
+    def _prefix(self):
+        return BIN_PREFIX
+
+    def _asok_path(self):
+        # In teuthology, the asok is named after the PID of the ceph-fuse process, because it's
+        # run foreground.  When running it daemonized however, the asok is named after
+        # the PID of the launching process, not the long running ceph-fuse process.  Therefore
+        # we need to give an exact path here as the logic for checking /proc/ for which
+        # asok is alive does not work.
+
+        # Load the asok path from ceph.conf as vstart.sh now puts admin sockets
+        # in a tmpdir. All of the paths are the same, so no need to select
+        # based off of the service type.
+        d = "./out"
+        with open(self.config_path) as f:
+            for line in f:
+                asok_conf = re.search("^\s*admin\s+socket\s*=\s*(.*?)[^/]+$", line)
+                if asok_conf:
+                    d = asok_conf.groups(1)[0]
+                    break
+        path = "{0}/client.{1}.*.asok".format(d, self.client_id)
+        return path
+
+    def mount(self, mntopts=[], createfs=True, check_status=True, **kwargs):
+        self.update_attrs(**kwargs)
+        self.assert_and_log_minimum_mount_details()
+
+        if opt_use_ns:
+            self.using_namespace = True
+            self.setup_netns()
+        else:
+            self.using_namespace = False
+
+        if not self.cephfs_mntpt:
+            self.cephfs_mntpt = "/"
+        # TODO: don't call setupfs() from within mount()
+        if createfs:
+            self.setupfs(name=self.cephfs_name)
+
+        opts = 'norequire_active_mds'
+        if self.client_id:
+            opts += ',name=' + self.client_id
+        if self.client_keyring_path and self.client_id:
+            opts += ",secret=" + self.get_key_from_keyfile()
+        if self.config_path:
+            opts += ',conf=' + self.config_path
+        if self.cephfs_name:
+            opts += ",mds_namespace={0}".format(self.cephfs_name)
+        if mntopts:
+            opts += ',' + ','.join(mntopts)
+
+        stderr = StringIO()
+        try:
+            self.client_remote.run(args=['mkdir', '--', self.hostfs_mntpt],
+                                   timeout=(5*60), stderr=stderr)
+        except CommandFailedError:
+            if 'file exists' not in stderr.getvalue().lower():
+                raise
+
+        if self.cephfs_mntpt is None:
+            self.cephfs_mntpt = "/"
+        cmdargs = ['sudo']
+        if self.using_namespace:
+           cmdargs += ['nsenter',
+                       '--net=/var/run/netns/{0}'.format(self.netns_name)]
+        cmdargs += ['./bin/mount.ceph', ':' + self.cephfs_mntpt,
+                    self.hostfs_mntpt, '-v', '-o', opts]
+
+        mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO()
+        try:
+            self.client_remote.run(args=cmdargs, timeout=(30*60),
+                omit_sudo=False, stdout=mountcmd_stdout,
+                stderr=mountcmd_stderr)
+        except CommandFailedError as e:
+            if check_status:
+                raise
+            else:
+                return (e, mountcmd_stdout.getvalue(),
+                        mountcmd_stderr.getvalue())
+
+        stderr = StringIO()
+        try:
+            self.client_remote.run(args=['sudo', 'chmod', '1777',
+                                   self.hostfs_mntpt], stderr=stderr,
+                                   timeout=(5*60))
+        except CommandFailedError:
+            # the client does not have write permissions in cap it holds for
+            # the Ceph FS that was just mounted.
+            if 'permission denied' in stderr.getvalue().lower():
+                pass
+
+        self.mounted = True
+
+    def cleanup_netns(self):
+        if self.using_namespace:
+            super(type(self), self).cleanup_netns()
+
+    def _run_python(self, pyscript, py_version='python'):
+        """
+        Override this to remove the daemon-helper prefix that is used otherwise
+        to make the process killable.
+        """
+        return self.client_remote.run(args=[py_version, '-c', pyscript],
+                                      wait=False, stdout=StringIO())
+
+class LocalFuseMount(FuseMount):
+    def __init__(self, ctx, test_dir, client_id, client_keyring_path=None,
+                 client_remote=None, hostfs_mntpt=None, cephfs_name=None,
+                 cephfs_mntpt=None, brxnet=None):
+        super(LocalFuseMount, self).__init__(ctx=ctx, client_config=None,
+            test_dir=test_dir, client_id=client_id,
+            client_keyring_path=client_keyring_path,
+            client_remote=LocalRemote(), hostfs_mntpt=hostfs_mntpt,
+            cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet)
+
+    @property
+    def config_path(self):
+        return "./ceph.conf"
+
+    def get_keyring_path(self):
+        # This is going to end up in a config file, so use an absolute path
+        # to avoid assumptions about daemons' pwd
+        return os.path.abspath("./client.{0}.keyring".format(self.client_id))
+
+    def setupfs(self, name=None):
+        if name is None and self.fs is not None:
+            # Previous mount existed, reuse the old name
+            name = self.fs.name
+        self.fs = LocalFilesystem(self.ctx, name=name)
+        log.debug('Wait for MDS to reach steady state...')
+        self.fs.wait_for_daemons()
+        log.debug('Ready to start {}...'.format(type(self).__name__))
+
+    @property
+    def _prefix(self):
+        return BIN_PREFIX
+
+    def _asok_path(self):
+        # In teuthology, the asok is named after the PID of the ceph-fuse process, because it's
+        # run foreground.  When running it daemonized however, the asok is named after
+        # the PID of the launching process, not the long running ceph-fuse process.  Therefore
+        # we need to give an exact path here as the logic for checking /proc/ for which
+        # asok is alive does not work.
+
+        # Load the asok path from ceph.conf as vstart.sh now puts admin sockets
+        # in a tmpdir. All of the paths are the same, so no need to select
+        # based off of the service type.
+        d = "./out"
+        with open(self.config_path) as f:
+            for line in f:
+                asok_conf = re.search("^\s*admin\s+socket\s*=\s*(.*?)[^/]+$", line)
+                if asok_conf:
+                    d = asok_conf.groups(1)[0]
+                    break
+        path = "{0}/client.{1}.*.asok".format(d, self.client_id)
+        return path
+
+    def mount(self, mntopts=[], createfs=True, check_status=True, **kwargs):
+        self.update_attrs(**kwargs)
+        self.assert_and_log_minimum_mount_details()
+
+        if opt_use_ns:
+            self.using_namespace = True
+            self.setup_netns()
+        else:
+            self.using_namespace = False
+
+        # TODO: don't call setupfs() from within mount()
+        if createfs:
+            self.setupfs(name=self.cephfs_name)
+
+        stderr = StringIO()
+        try:
+            self.client_remote.run(args=['mkdir', '-p', self.hostfs_mntpt],
+                                   stderr=stderr)
+        except CommandFailedError:
+            if 'file exists' not in stderr.getvalue().lower():
+                raise
+
+        def list_connections():
+            self.client_remote.run(
+                args=["mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
+                check_status=False
+            )
+
+            p = self.client_remote.run(args=["ls", "/sys/fs/fuse/connections"],
+                                       check_status=False, stdout=StringIO())
+            if p.exitstatus != 0:
+                log.warning("ls conns failed with {0}, assuming none".format(p.exitstatus))
+                return []
+
+            ls_str = p.stdout.getvalue().strip()
+            if ls_str:
+                return [int(n) for n in ls_str.split("\n")]
+            else:
+                return []
+
+        # Before starting ceph-fuse process, note the contents of
+        # /sys/fs/fuse/connections
+        pre_mount_conns = list_connections()
+        log.debug("Pre-mount connections: {0}".format(pre_mount_conns))
+
+        cmdargs = []
+        if self.using_namespace:
+            cmdargs = ['sudo', 'nsenter',
+                       '--net=/var/run/netns/{0}'.format(self.netns_name),
+                       '--setuid', str(os.getuid())]
+        cmdargs += [os.path.join(BIN_PREFIX, 'ceph-fuse'), self.hostfs_mntpt,
+                    '-f']
+        if self.client_id is not None:
+            cmdargs += ["--id", self.client_id]
+        if self.client_keyring_path and self.client_id is not None:
+            cmdargs.extend(['-k', self.client_keyring_path])
+        if self.cephfs_name:
+            cmdargs += ["--client_fs=" + self.cephfs_name]
+        if self.cephfs_mntpt:
+            cmdargs += ["--client_mountpoint=" + self.cephfs_mntpt]
+        if os.getuid() != 0:
+            cmdargs += ["--client_die_on_failed_dentry_invalidate=false"]
+        if mntopts:
+            cmdargs += mntopts
+
+        mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO()
+        self.fuse_daemon = self.client_remote.run(args=cmdargs, wait=False,
+            omit_sudo=False, stdout=mountcmd_stdout, stderr=mountcmd_stderr)
+        self._set_fuse_daemon_pid(check_status)
+        log.debug("Mounting client.{0} with pid "
+                 "{1}".format(self.client_id, self.fuse_daemon.subproc.pid))
+
+        # Wait for the connection reference to appear in /sys
+        waited = 0
+        post_mount_conns = list_connections()
+        while len(post_mount_conns) <= len(pre_mount_conns):
+            if self.fuse_daemon.finished:
+                # Did mount fail?  Raise the CommandFailedError instead of
+                # hitting the "failed to populate /sys/" timeout
+                try:
+                    self.fuse_daemon.wait()
+                except CommandFailedError as e:
+                    if check_status:
+                        raise
+                    else:
+                        return (e, mountcmd_stdout.getvalue(),
+                                mountcmd_stderr.getvalue())
+            time.sleep(1)
+            waited += 1
+            if waited > 30:
+                raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
+                    waited
+                ))
+            post_mount_conns = list_connections()
+
+        log.debug("Post-mount connections: {0}".format(post_mount_conns))
+
+        # Record our fuse connection number so that we can use it when
+        # forcing an unmount
+        new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
+        if len(new_conns) == 0:
+            raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
+        elif len(new_conns) > 1:
+            raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
+        else:
+            self._fuse_conn = new_conns[0]
+
+        self.gather_mount_info()
+
+        self.mounted = True
+
+    def _set_fuse_daemon_pid(self, check_status):
+        # NOTE: When a command <args> is launched with sudo, two processes are
+        # launched, one with sudo in <args> and other without. Make sure we
+        # get the PID of latter one.
+        try:
+            with safe_while(sleep=1, tries=15) as proceed:
+                while proceed():
+                    try:
+                        sock = self.find_admin_socket()
+                    except (RuntimeError, CommandFailedError):
+                        continue
+
+                    self.fuse_daemon.fuse_pid = int(re.match(".*\.(\d+)\.asok$",
+                                                             sock).group(1))
+                    break
+        except MaxWhileTries:
+            if check_status:
+                raise
+            else:
+                pass
+
+    def cleanup_netns(self):
+        if self.using_namespace:
+            super(type(self), self).cleanup_netns()
+
+    def _run_python(self, pyscript, py_version='python'):
+        """
+        Override this to remove the daemon-helper prefix that is used otherwise
+        to make the process killable.
+        """
+        return self.client_remote.run(args=[py_version, '-c', pyscript],
+                                      wait=False, stdout=StringIO())
+
+# XXX: this class has nothing to do with the Ceph daemon (ceph-mgr) of
+# the same name.
+class LocalCephManager(CephManager):
+    def __init__(self):
+        # Deliberately skip parent init, only inheriting from it to get
+        # util methods like osd_dump that sit on top of raw_cluster_cmd
+        self.controller = LocalRemote()
+
+        # A minority of CephManager fns actually bother locking for when
+        # certain teuthology tests want to run tasks in parallel
+        self.lock = threading.RLock()
+
+        self.log = lambda x: log.debug(x)
+
+        # Don't bother constructing a map of pools: it should be empty
+        # at test cluster start, and in any case it would be out of date
+        # in no time.  The attribute needs to exist for some of the CephManager
+        # methods to work though.
+        self.pools = {}
+
+    def find_remote(self, daemon_type, daemon_id):
+        """
+        daemon_type like 'mds', 'osd'
+        daemon_id like 'a', '0'
+        """
+        return LocalRemote()
+
+    def run_ceph_w(self, watch_channel=None):
+        """
+        :param watch_channel: Specifies the channel to be watched.
+                              This can be 'cluster', 'audit', ...
+        :type watch_channel: str
+        """
+        args = [os.path.join(BIN_PREFIX, "ceph"), "-w"]
+        if watch_channel is not None:
+            args.append("--watch-channel")
+            args.append(watch_channel)
+        proc = self.controller.run(args=args, wait=False, stdout=StringIO())
+        return proc
+
+    def run_cluster_cmd(self, **kwargs):
+        """
+        Run a Ceph command and the object representing the process for the
+        command.
+
+        Accepts arguments same as teuthology.orchestra.remote.run().
+        """
+        kwargs['args'] = [os.path.join(BIN_PREFIX,'ceph')]+list(kwargs['args'])
+        return self.controller.run(**kwargs)
+
+    def raw_cluster_cmd(self, *args, **kwargs) -> str:
+        """
+        args like ["osd", "dump"}
+        return stdout string
+        """
+        kwargs['args'] = args
+        if kwargs.get('stdout') is None:
+            kwargs['stdout'] = StringIO()
+        return self.run_cluster_cmd(**kwargs).stdout.getvalue()
+
+    def raw_cluster_cmd_result(self, *args, **kwargs):
+        """
+        like raw_cluster_cmd but don't check status, just return rc
+        """
+        kwargs['args'], kwargs['check_status'] = args, False
+        return self.run_cluster_cmd(**kwargs).exitstatus
+
+    def admin_socket(self, daemon_type, daemon_id, command, check_status=True,
+                     timeout=None, stdout=None):
+        if stdout is None:
+            stdout = StringIO()
+
+        return self.controller.run(
+            args=[os.path.join(BIN_PREFIX, "ceph"), "daemon",
+                  "{0}.{1}".format(daemon_type, daemon_id)] + command,
+            check_status=check_status, timeout=timeout, stdout=stdout)
+
+    def get_mon_socks(self):
+        """
+        Get monitor sockets.
+
+        :return socks: tuple of strings; strings are individual sockets.
+        """
+        from json import loads
+
+        output = loads(self.raw_cluster_cmd('--format=json', 'mon', 'dump'))
+        socks = []
+        for mon in output['mons']:
+            for addrvec_mem in mon['public_addrs']['addrvec']:
+                socks.append(addrvec_mem['addr'])
+        return tuple(socks)
+
+    def get_msgrv1_mon_socks(self):
+        """
+        Get monitor sockets that use msgrv2 to operate.
+
+        :return socks: tuple of strings; strings are individual sockets.
+        """
+        from json import loads
+
+        output = loads(self.raw_cluster_cmd('--format=json', 'mon', 'dump'))
+        socks = []
+        for mon in output['mons']:
+            for addrvec_mem in mon['public_addrs']['addrvec']:
+                if addrvec_mem['type'] == 'v1':
+                    socks.append(addrvec_mem['addr'])
+        return tuple(socks)
+
+    def get_msgrv2_mon_socks(self):
+        """
+        Get monitor sockets that use msgrv2 to operate.
+
+        :return socks: tuple of strings; strings are individual sockets.
+        """
+        from json import loads
+
+        output = loads(self.raw_cluster_cmd('--format=json', 'mon', 'dump'))
+        socks = []
+        for mon in output['mons']:
+            for addrvec_mem in mon['public_addrs']['addrvec']:
+                if addrvec_mem['type'] == 'v2':
+                    socks.append(addrvec_mem['addr'])
+        return tuple(socks)
+
+
+class LocalCephCluster(CephCluster):
+    def __init__(self, ctx):
+        # Deliberately skip calling CephCluster constructor
+        self._ctx = ctx
+        self.mon_manager = LocalCephManager()
+        self._conf = defaultdict(dict)
+
+    @property
+    def admin_remote(self):
+        return LocalRemote()
+
+    def get_config(self, key, service_type=None):
+        if service_type is None:
+            service_type = 'mon'
+
+        # FIXME hardcoded vstart service IDs
+        service_id = {
+            'mon': 'a',
+            'mds': 'a',
+            'osd': '0'
+        }[service_type]
+
+        return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+    def _write_conf(self):
+        # In teuthology, we have the honour of writing the entire ceph.conf, but
+        # in vstart land it has mostly already been written and we need to carefully
+        # append to it.
+        conf_path = "./ceph.conf"
+        banner = "\n#LOCAL_TEST\n"
+        existing_str = open(conf_path).read()
+
+        if banner in existing_str:
+            existing_str = existing_str[0:existing_str.find(banner)]
+
+        existing_str += banner
+
+        for subsys, kvs in self._conf.items():
+            existing_str += "\n[{0}]\n".format(subsys)
+            for key, val in kvs.items():
+                # Comment out existing instance if it exists
+                log.debug("Searching for existing instance {0}/{1}".format(
+                    key, subsys
+                ))
+                existing_section = re.search("^\[{0}\]$([\n]|[^\[])+".format(
+                    subsys
+                ), existing_str, re.MULTILINE)
+
+                if existing_section:
+                    section_str = existing_str[existing_section.start():existing_section.end()]
+                    existing_val = re.search("^\s*[^#]({0}) =".format(key), section_str, re.MULTILINE)
+                    if existing_val:
+                        start = existing_section.start() + existing_val.start(1)
+                        log.debug("Found string to replace at {0}".format(
+                            start
+                        ))
+                        existing_str = existing_str[0:start] + "#" + existing_str[start:]
+
+                existing_str += "{0} = {1}\n".format(key, val)
+
+        open(conf_path, "w").write(existing_str)
+
+    def set_ceph_conf(self, subsys, key, value):
+        self._conf[subsys][key] = value
+        self._write_conf()
+
+    def clear_ceph_conf(self, subsys, key):
+        del self._conf[subsys][key]
+        self._write_conf()
+
+
+class LocalMDSCluster(LocalCephCluster, MDSCluster):
+    def __init__(self, ctx):
+        LocalCephCluster.__init__(self, ctx)
+        # Deliberately skip calling MDSCluster constructor
+        self._mds_ids = ctx.daemons.daemons['ceph.mds'].keys()
+        log.debug("Discovered MDS IDs: {0}".format(self._mds_ids))
+        self._mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids])
+
+    @property
+    def mds_ids(self):
+        return self._mds_ids
+
+    @property
+    def mds_daemons(self):
+        return self._mds_daemons
+
+    def clear_firewall(self):
+        # FIXME: unimplemented
+        pass
+
+    def newfs(self, name='cephfs', create=True):
+        return LocalFilesystem(self._ctx, name=name, create=create)
+
+    def delete_all_filesystems(self):
+        """
+        Remove all filesystems that exist, and any pools in use by them.
+        """
+        for fs in self.status().get_filesystems():
+            LocalFilesystem(ctx=self._ctx, fscid=fs['id']).destroy()
+
+
+class LocalMgrCluster(LocalCephCluster, MgrCluster):
+    def __init__(self, ctx):
+        super(LocalMgrCluster, self).__init__(ctx)
+
+        self.mgr_ids = ctx.daemons.daemons['ceph.mgr'].keys()
+        self.mgr_daemons = dict([(id_, LocalDaemon("mgr", id_)) for id_ in self.mgr_ids])
+
+
+class LocalFilesystem(LocalMDSCluster, Filesystem):
+    def __init__(self, ctx, fs_config={}, fscid=None, name=None, create=False):
+        # Deliberately skip calling Filesystem constructor
+        LocalMDSCluster.__init__(self, ctx)
+
+        self.id = None
+        self.name = name
+        self.metadata_pool_name = None
+        self.metadata_overlay = False
+        self.data_pool_name = None
+        self.data_pools = None
+        self.fs_config = fs_config
+        self.ec_profile = fs_config.get('ec_profile')
+
+        self.mon_manager = LocalCephManager()
+
+        self.client_remote = LocalRemote()
+
+        self._conf = defaultdict(dict)
+
+        if name is not None:
+            if fscid is not None:
+                raise RuntimeError("cannot specify fscid when creating fs")
+            if create and not self.legacy_configured():
+                self.create()
+        else:
+            if fscid is not None:
+                self.id = fscid
+                self.getinfo(refresh=True)
+
+        # Stash a reference to the first created filesystem on ctx, so
+        # that if someone drops to the interactive shell they can easily
+        # poke our methods.
+        if not hasattr(self._ctx, "filesystem"):
+            self._ctx.filesystem = self
+
+    @property
+    def _prefix(self):
+        return BIN_PREFIX
+
+    def set_clients_block(self, blocked, mds_id=None):
+        raise NotImplementedError()
+
+
+class LocalCluster(object):
+    def __init__(self, rolename="placeholder"):
+        self.remotes = {
+            LocalRemote(): [rolename]
+        }
+
+    def only(self, requested):
+        return self.__class__(rolename=requested)
+
+    def run(self, *args, **kwargs):
+        r = []
+        for remote in self.remotes.keys():
+            r.append(remote.run(*args, **kwargs))
+        return r
+
+
+class LocalContext(object):
+    def __init__(self):
+        self.config = {}
+        self.teuthology_config = teuth_config
+        self.cluster = LocalCluster()
+        self.daemons = DaemonGroup()
+
+        # Shove some LocalDaemons into the ctx.daemons DaemonGroup instance so that any
+        # tests that want to look these up via ctx can do so.
+        # Inspect ceph.conf to see what roles exist
+        for conf_line in open("ceph.conf").readlines():
+            for svc_type in ["mon", "osd", "mds", "mgr"]:
+                prefixed_type = "ceph." + svc_type
+                if prefixed_type not in self.daemons.daemons:
+                    self.daemons.daemons[prefixed_type] = {}
+                match = re.match("^\[{0}\.(.+)\]$".format(svc_type), conf_line)
+                if match:
+                    svc_id = match.group(1)
+                    self.daemons.daemons[prefixed_type][svc_id] = LocalDaemon(svc_type, svc_id)
+
+    def __del__(self):
+        test_path = self.teuthology_config['test_path']
+        # opt_create_cluster_only does not create the test path
+        if test_path:
+            shutil.rmtree(test_path)
+
+
+#########################################
+#
+# stuff necessary for launching tests...
+#
+#########################################
+
+
+def enumerate_methods(s):
+    log.debug("e: {0}".format(s))
+    for t in s._tests:
+        if isinstance(t, suite.BaseTestSuite):
+            for sub in enumerate_methods(t):
+                yield sub
+        else:
+            yield s, t
+
+
+def load_tests(modules, loader):
+    if modules:
+        log.debug("Executing modules: {0}".format(modules))
+        module_suites = []
+        for mod_name in modules:
+            # Test names like cephfs.test_auto_repair
+            module_suites.append(loader.loadTestsFromName(mod_name))
+        log.debug("Loaded: {0}".format(list(module_suites)))
+        return suite.TestSuite(module_suites)
+    else:
+        log.debug("Executing all cephfs tests")
+        return loader.discover(
+            os.path.join(os.path.dirname(os.path.abspath(__file__)), "cephfs")
+        )
+
+
+def scan_tests(modules):
+    overall_suite = load_tests(modules, loader.TestLoader())
+
+    max_required_mds = 0
+    max_required_clients = 0
+    max_required_mgr = 0
+    require_memstore = False
+
+    for suite_, case in enumerate_methods(overall_suite):
+        max_required_mds = max(max_required_mds,
+                               getattr(case, "MDSS_REQUIRED", 0))
+        max_required_clients = max(max_required_clients,
+                               getattr(case, "CLIENTS_REQUIRED", 0))
+        max_required_mgr = max(max_required_mgr,
+                               getattr(case, "MGRS_REQUIRED", 0))
+        require_memstore = getattr(case, "REQUIRE_MEMSTORE", False) \
+                               or require_memstore
+
+    return max_required_mds, max_required_clients, \
+            max_required_mgr, require_memstore
+
+
+class LogRotate():
+    def __init__(self):
+        self.conf_file_path = os.path.join(os.getcwd(), 'logrotate.conf')
+        self.state_file_path = os.path.join(os.getcwd(), 'logrotate.state')
+
+    def run_logrotate(self):
+        remote.run(args=['logrotate', '-f', self.conf_file_path, '-s',
+                         self.state_file_path, '--verbose'])
+
+
+def teardown_cluster():
+    log.info('\ntearing down the cluster...')
+    remote.run(args=[os.path.join(SRC_PREFIX, "stop.sh")], timeout=60)
+    remote.run(args=['rm', '-rf', './dev', './out'])
+
+
+def clear_old_log():
+    from os import stat
+
+    try:
+        stat(logpath)
+    # would need an update when making this py3 compatible. Use FileNotFound
+    # instead.
+    except OSError:
+        return
+    else:
+        os.remove(logpath)
+        with open(logpath, 'w') as logfile:
+            logfile.write('')
+        init_log()
+        log.debug('logging in a fresh file now...')
+
+
+class LogStream(object):
+    def __init__(self):
+        self.buffer = ""
+        self.omit_result_lines = False
+
+    def _del_result_lines(self):
+        """
+        Don't let unittest.TextTestRunner print "Ran X tests in Ys",
+        vstart_runner.py will do it for itself since it runs tests in a
+        testsuite one by one.
+        """
+        if self.omit_result_lines:
+            self.buffer = re.sub('-'*70+'\nran [0-9]* test in [0-9.]*s\n*',
+                                 '', self.buffer, flags=re.I)
+        self.buffer = re.sub('failed \(failures=[0-9]*\)\n', '', self.buffer,
+                             flags=re.I)
+        self.buffer = self.buffer.replace('OK\n', '')
+
+    def write(self, data):
+        self.buffer += data
+        if self.buffer.count("\n") > 5:
+            self._write()
+
+    def _write(self):
+        if opt_rotate_logs:
+            self._del_result_lines()
+        if self.buffer == '':
+            return
+
+        lines = self.buffer.split("\n")
+        for line in lines:
+            # sys.stderr.write(line + "\n")
+            log.info(line)
+        self.buffer = ''
+
+    def flush(self):
+        pass
+
+    def __del__(self):
+        self._write()
+
+
+class InteractiveFailureResult(unittest.TextTestResult):
+    """
+    Specialization that implements interactive-on-error style
+    behavior.
+    """
+    def addFailure(self, test, err):
+        super(InteractiveFailureResult, self).addFailure(test, err)
+        log.error(self._exc_info_to_string(err, test))
+        log.error("Failure in test '{0}', going interactive".format(
+            self.getDescription(test)
+        ))
+        interactive.task(ctx=None, config=None)
+
+    def addError(self, test, err):
+        super(InteractiveFailureResult, self).addError(test, err)
+        log.error(self._exc_info_to_string(err, test))
+        log.error("Error in test '{0}', going interactive".format(
+            self.getDescription(test)
+        ))
+        interactive.task(ctx=None, config=None)
+
+
+# XXX: class we require would be inherited from this one and one of
+# InteractiveFailureResult and unittestunittest.TextTestResult.
+class LoggingResultTemplate(object):
+    fail_on_skip = False
+
+    def startTest(self, test):
+        log.info("Starting test: {0}".format(self.getDescription(test)))
+        test.started_at = datetime.datetime.utcnow()
+        return super(LoggingResultTemplate, self).startTest(test)
+
+    def stopTest(self, test):
+        log.info("Stopped test: {0} in {1}s".format(
+            self.getDescription(test),
+            (datetime.datetime.utcnow() - test.started_at).total_seconds()
+        ))
+
+    def addSkip(self, test, reason):
+        if LoggingResultTemplate.fail_on_skip:
+            # Don't just call addFailure because that requires a traceback
+            self.failures.append((test, reason))
+        else:
+            super(LoggingResultTemplate, self).addSkip(test, reason)
+
+
+def launch_tests(overall_suite):
+    if opt_rotate_logs or not opt_exit_on_test_failure:
+        return launch_individually(overall_suite)
+    else:
+        return launch_entire_suite(overall_suite)
+
+
+def get_logging_result_class():
+    result_class = InteractiveFailureResult if opt_interactive_on_error else \
+        unittest.TextTestResult
+    return type('', (LoggingResultTemplate, result_class), {})
+
+
+def launch_individually(overall_suite):
+    no_of_tests_execed = 0
+    no_of_tests_failed, no_of_tests_execed = 0, 0
+    LoggingResult = get_logging_result_class()
+    stream = LogStream()
+    stream.omit_result_lines = True
+    if opt_rotate_logs:
+        logrotate = LogRotate()
+
+    started_at = datetime.datetime.utcnow()
+    for suite_, case in enumerate_methods(overall_suite):
+        # don't run logrotate beforehand since some ceph daemons might be
+        # down and pre/post-rotate scripts in logrotate.conf might fail.
+        if opt_rotate_logs:
+            logrotate.run_logrotate()
+
+        result = unittest.TextTestRunner(stream=stream,
+                                         resultclass=LoggingResult,
+                                         verbosity=2, failfast=True).run(case)
+
+        if not result.wasSuccessful():
+            if opt_exit_on_test_failure:
+                break
+            else:
+                no_of_tests_failed += 1
+
+        no_of_tests_execed += 1
+    time_elapsed = (datetime.datetime.utcnow() - started_at).total_seconds()
+
+    if result.wasSuccessful():
+        log.info('')
+        log.info('-'*70)
+        log.info(f'Ran {no_of_tests_execed} tests in {time_elapsed}s')
+        if no_of_tests_failed > 0:
+            log.info(f'{no_of_tests_failed} tests failed')
+        log.info('')
+        log.info('OK')
+
+    return result
+
+
+def launch_entire_suite(overall_suite):
+    LoggingResult = get_logging_result_class()
+
+    testrunner = unittest.TextTestRunner(stream=LogStream(),
+                                         resultclass=LoggingResult,
+                                         verbosity=2, failfast=True)
+    return testrunner.run(overall_suite)
+
+
+def exec_test():
+    # Parse arguments
+    global opt_interactive_on_error
+    opt_interactive_on_error = False
+    opt_create_cluster = False
+    opt_create_cluster_only = False
+    opt_ignore_missing_binaries = False
+    opt_teardown_cluster = False
+    global opt_log_ps_output
+    opt_log_ps_output = False
+    use_kernel_client = False
+    global opt_use_ns
+    opt_use_ns = False
+    opt_brxnet= None
+    opt_verbose = True
+    global opt_rotate_logs
+    opt_rotate_logs = False
+    global opt_exit_on_test_failure
+    opt_exit_on_test_failure = True
+
+    args = sys.argv[1:]
+    flags = [a for a in args if a.startswith("-")]
+    modules = [a for a in args if not a.startswith("-")]
+    for f in flags:
+        if f == "--interactive":
+            opt_interactive_on_error = True
+        elif f == "--create":
+            opt_create_cluster = True
+        elif f == "--create-cluster-only":
+            opt_create_cluster_only = True
+        elif f == "--ignore-missing-binaries":
+            opt_ignore_missing_binaries = True
+        elif f == '--teardown':
+            opt_teardown_cluster = True
+        elif f == '--log-ps-output':
+            opt_log_ps_output = True
+        elif f == '--clear-old-log':
+            clear_old_log()
+        elif f == "--kclient":
+            use_kernel_client = True
+        elif f == '--usens':
+            opt_use_ns = True
+        elif '--brxnet' in f:
+            if re.search(r'=[0-9./]+', f) is None:
+                log.error("--brxnet=<ip/mask> option needs one argument: '{0}'".format(f))
+                sys.exit(-1)
+            opt_brxnet=f.split('=')[1]
+            try:
+                IP(opt_brxnet)
+                if IP(opt_brxnet).iptype() == 'PUBLIC':
+                    raise RuntimeError('is public')
+            except Exception as e:
+                log.error("Invalid ip '{0}' {1}".format(opt_brxnet, e))
+                sys.exit(-1)
+        elif '--no-verbose' == f:
+            opt_verbose = False
+        elif f == '--rotate-logs':
+            opt_rotate_logs = True
+        elif f == '--run-all-tests':
+            opt_exit_on_test_failure = False
+        elif f == '--debug':
+            log.setLevel(logging.DEBUG)
+        else:
+            log.error("Unknown option '{0}'".format(f))
+            sys.exit(-1)
+
+    # Help developers by stopping up-front if their tree isn't built enough for all the
+    # tools that the tests might want to use (add more here if needed)
+    require_binaries = ["ceph-dencoder", "cephfs-journal-tool", "cephfs-data-scan",
+                        "cephfs-table-tool", "ceph-fuse", "rados", "cephfs-meta-injection"]
+    missing_binaries = [b for b in require_binaries if not os.path.exists(os.path.join(BIN_PREFIX, b))]
+    if missing_binaries and not opt_ignore_missing_binaries:
+        log.error("Some ceph binaries missing, please build them: {0}".format(" ".join(missing_binaries)))
+        sys.exit(-1)
+
+    max_required_mds, max_required_clients, \
+            max_required_mgr, require_memstore = scan_tests(modules)
+
+    global remote
+    remote = LocalRemote()
+
+    CephFSMount.cleanup_stale_netnses_and_bridge(remote)
+
+    # Tolerate no MDSs or clients running at start
+    ps_txt = remote.run(args=["ps", "-u"+str(os.getuid())],
+                        stdout=StringIO()).stdout.getvalue().strip()
+    lines = ps_txt.split("\n")[1:]
+    for line in lines:
+        if 'ceph-fuse' in line or 'ceph-mds' in line:
+            pid = int(line.split()[0])
+            log.warning("Killing stray process {0}".format(line))
+            os.kill(pid, signal.SIGKILL)
+
+    # Fire up the Ceph cluster if the user requested it
+    if opt_create_cluster or opt_create_cluster_only:
+        log.info("Creating cluster with {0} MDS daemons".format(
+            max_required_mds))
+        teardown_cluster()
+        vstart_env = os.environ.copy()
+        vstart_env["FS"] = "0"
+        vstart_env["MDS"] = max_required_mds.__str__()
+        vstart_env["OSD"] = "4"
+        vstart_env["MGR"] = max(max_required_mgr, 1).__str__()
+
+        args = [
+            os.path.join(SRC_PREFIX, "vstart.sh"),
+            "-n",
+            "--nolockdep",
+        ]
+        if require_memstore:
+            args.append("--memstore")
+
+        if opt_verbose:
+            args.append("-d")
+
+        # usually, i get vstart.sh running completely in less than 100
+        # seconds.
+        remote.run(args=args, env=vstart_env, timeout=(3 * 60))
+
+        # Wait for OSD to come up so that subsequent injectargs etc will
+        # definitely succeed
+        LocalCephCluster(LocalContext()).mon_manager.wait_for_all_osds_up(timeout=30)
+
+    if opt_create_cluster_only:
+        return
+
+    if opt_use_ns and mon_in_localhost() and not opt_create_cluster:
+        raise RuntimeError("cluster is on localhost; '--usens' option is incompatible. Or you can pass an extra '--create' option to create a new cluster without localhost!")
+
+    # List of client mounts, sufficient to run the selected tests
+    clients = [i.__str__() for i in range(0, max_required_clients)]
+
+    test_dir = tempfile.mkdtemp()
+    teuth_config['test_path'] = test_dir
+
+    ctx = LocalContext()
+    ceph_cluster = LocalCephCluster(ctx)
+    mds_cluster = LocalMDSCluster(ctx)
+    mgr_cluster = LocalMgrCluster(ctx)
+
+    # Construct Mount classes
+    mounts = []
+    for client_id in clients:
+        # Populate client keyring (it sucks to use client.admin for test clients
+        # because it's awkward to find the logs later)
+        client_name = "client.{0}".format(client_id)
+
+        if client_name not in open("./keyring").read():
+            p = remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "auth", "get-or-create", client_name,
+                                 "osd", "allow rw",
+                                 "mds", "allow",
+                                 "mon", "allow r"], stdout=StringIO())
+
+            open("./keyring", "at").write(p.stdout.getvalue())
+
+        if use_kernel_client:
+            mount = LocalKernelMount(ctx=ctx, test_dir=test_dir,
+                                     client_id=client_id, brxnet=opt_brxnet)
+        else:
+            mount = LocalFuseMount(ctx=ctx, test_dir=test_dir,
+                                   client_id=client_id, brxnet=opt_brxnet)
+
+        mounts.append(mount)
+        if os.path.exists(mount.hostfs_mntpt):
+            if mount.is_mounted():
+                log.warning("unmounting {0}".format(mount.hostfs_mntpt))
+                mount.umount_wait()
+            else:
+                os.rmdir(mount.hostfs_mntpt)
+
+    from tasks.cephfs_test_runner import DecoratingLoader
+
+    decorating_loader = DecoratingLoader({
+        "ctx": ctx,
+        "mounts": mounts,
+        "ceph_cluster": ceph_cluster,
+        "mds_cluster": mds_cluster,
+        "mgr_cluster": mgr_cluster,
+    })
+
+    # For the benefit of polling tests like test_full -- in teuthology land we set this
+    # in a .yaml, here it's just a hardcoded thing for the developer's pleasure.
+    remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "tell", "osd.*", "injectargs", "--osd-mon-report-interval", "5"])
+    ceph_cluster.set_ceph_conf("osd", "osd_mon_report_interval", "5")
+
+    # Vstart defaults to two segments, which very easily gets a "behind on trimming" health warning
+    # from normal IO latency.  Increase it for running teests.
+    ceph_cluster.set_ceph_conf("mds", "mds log max segments", "10")
+
+    # Make sure the filesystem created in tests has uid/gid that will let us talk to
+    # it after mounting it (without having to  go root).  Set in 'global' not just 'mds'
+    # so that cephfs-data-scan will pick it up too.
+    ceph_cluster.set_ceph_conf("global", "mds root ino uid", "%s" % os.getuid())
+    ceph_cluster.set_ceph_conf("global", "mds root ino gid", "%s" % os.getgid())
+
+    # Monkeypatch get_package_version to avoid having to work out what kind of distro we're on
+    def _get_package_version(remote, pkg_name):
+        # Used in cephfs tests to find fuse version.  Your development workstation *does* have >=2.9, right?
+        return "2.9"
+
+    import teuthology.packaging
+    teuthology.packaging.get_package_version = _get_package_version
+
+    overall_suite = load_tests(modules, decorating_loader)
+
+    # Filter out tests that don't lend themselves to interactive running,
+    victims = []
+    for case, method in enumerate_methods(overall_suite):
+        fn = getattr(method, method._testMethodName)
+
+        drop_test = False
+
+        if hasattr(fn, 'is_for_teuthology') and getattr(fn, 'is_for_teuthology') is True:
+            drop_test = True
+            log.warning("Dropping test because long running: {method_id}".format(method_id=method.id()))
+
+        if getattr(fn, "needs_trimming", False) is True:
+            drop_test = (os.getuid() != 0)
+            log.warning("Dropping test because client trim unavailable: {method_id}".format(method_id=method.id()))
+
+        if drop_test:
+            # Don't drop the test if it was explicitly requested in arguments
+            is_named = False
+            for named in modules:
+                if named.endswith(method.id()):
+                    is_named = True
+                    break
+
+            if not is_named:
+                victims.append((case, method))
+
+    log.debug("Disabling {0} tests because of is_for_teuthology or needs_trimming".format(len(victims)))
+    for s, method in victims:
+        s._tests.remove(method)
+
+    overall_suite = load_tests(modules, loader.TestLoader())
+    result = launch_tests(overall_suite)
+
+    CephFSMount.cleanup_stale_netnses_and_bridge(remote)
+    if opt_teardown_cluster:
+        teardown_cluster()
+
+    if not result.wasSuccessful():
+        # no point in duplicating if we can have multiple failures in same
+        # run.
+        if opt_exit_on_test_failure:
+            result.printErrors()  # duplicate output at end for convenience
+
+        bad_tests = []
+        for test, error in result.errors:
+            bad_tests.append(str(test))
+        for test, failure in result.failures:
+            bad_tests.append(str(test))
+
+        sys.exit(-1)
+    else:
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    exec_test()
diff --git a/qa/tasks/watch_notify_same_primary.py b/qa/tasks/watch_notify_same_primary.py
new file mode 100644
index 000000000..448fee193
--- /dev/null
+++ b/qa/tasks/watch_notify_same_primary.py
@@ -0,0 +1,129 @@
+
+"""
+watch_notify_same_primary task
+"""
+from io import StringIO
+import contextlib
+import logging
+
+
+from teuthology.orchestra import run
+from teuthology.contextutil import safe_while
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run watch_notify_same_primary
+
+    The config should be as follows:
+
+    watch_notify_same_primary:
+        clients: [client list]
+
+    The client list should contain 1 client
+
+    The test requires 3 osds.
+
+    example:
+
+    tasks:
+    - ceph:
+    - watch_notify_same_primary:
+        clients: [client.0]
+    - interactive:
+    """
+    log.info('Beginning watch_notify_same_primary...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+
+    clients = config.get('clients', ['client.0'])
+    assert len(clients) == 1
+    role = clients[0]
+    assert isinstance(role, str)
+    PREFIX = 'client.'
+    assert role.startswith(PREFIX)
+    (remote,) = ctx.cluster.only(role).remotes.keys()
+    manager = ctx.managers['ceph']
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+
+    pool = manager.create_pool_with_unique_name()
+    def obj(n): return "foo-{num}".format(num=n)
+    def start_watch(n):
+        remote.run(
+            args = [
+                "rados",
+                "-p", pool,
+                "put",
+                obj(n),
+                "/etc/resolv.conf"],
+            logger=log.getChild('watch.{id}'.format(id=n)))
+        proc = remote.run(
+            args = [
+                "rados",
+                "-p", pool,
+                "watch",
+                obj(n)],
+            stdin=run.PIPE,
+            stdout=StringIO(),
+            stderr=StringIO(),
+            wait=False)
+        return proc
+
+    num = 20
+
+    watches = [start_watch(i) for i in range(num)]
+
+    # wait for them all to register
+    for i in range(num):
+        with safe_while() as proceed:
+            while proceed():
+                lines = remote.sh(
+                    ["rados", "-p", pool, "listwatchers", obj(i)])
+                num_watchers = lines.count('watcher=')
+                log.info('i see %d watchers for %s', num_watchers, obj(i))
+                if num_watchers >= 1:
+                    break
+
+    def notify(n, msg):
+        remote.run(
+            args = [
+                "rados",
+                "-p", pool,
+                "notify",
+                obj(n),
+                msg],
+            logger=log.getChild('notify.{id}'.format(id=n)))
+
+    [notify(n, 'notify1') for n in range(len(watches))]
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+
+    [notify(n, 'notify2') for n in range(len(watches))]
+
+    try:
+        yield
+    finally:
+        log.info('joining watch_notify_stress')
+        for watch in watches:
+            watch.stdin.write("\n")
+
+        run.wait(watches)
+
+        for watch in watches:
+            lines = watch.stdout.getvalue().split("\n")
+            got1 = False
+            got2 = False
+            for l in lines:
+                if 'notify1' in l:
+                    got1 = True
+                if 'notify2' in l:
+                    got2 = True
+            log.info(lines)
+            assert got1 and got2
+
+        manager.revive_osd(0)
+        manager.remove_pool(pool)
diff --git a/qa/tasks/watch_notify_stress.py b/qa/tasks/watch_notify_stress.py
new file mode 100644
index 000000000..47747b1ca
--- /dev/null
+++ b/qa/tasks/watch_notify_stress.py
@@ -0,0 +1,69 @@
+"""
+test_stress_watch task
+"""
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology.task import proc_thrasher
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run test_stress_watch
+
+    The config should be as follows:
+
+    test_stress_watch:
+        clients: [client list]
+
+    example:
+
+    tasks:
+    - ceph:
+    - test_stress_watch:
+        clients: [client.0]
+    - interactive:
+    """
+    log.info('Beginning test_stress_watch...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+    testwatch = {}
+
+    remotes = []
+
+    for role in config.get('clients', ['client.0']):
+        assert isinstance(role, str)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        remotes.append(remote)
+
+        args =['CEPH_CLIENT_ID={id_}'.format(id_=id_),
+               'CEPH_ARGS="{flags}"'.format(flags=config.get('flags', '')),
+               'daemon-helper',
+               'kill',
+               'multi_stress_watch foo foo'
+               ]
+
+        log.info("args are %s" % (args,))
+
+        proc = proc_thrasher.ProcThrasher({}, remote,
+            args=[run.Raw(i) for i in args],
+            logger=log.getChild('testwatch.{id}'.format(id=id_)),
+            stdin=run.PIPE,
+            wait=False
+            )
+        proc.start()
+        testwatch[id_] = proc
+
+    try:
+        yield
+    finally:
+        log.info('joining watch_notify_stress')
+        for i in testwatch.values():
+            i.join()
diff --git a/qa/tasks/workunit.py b/qa/tasks/workunit.py
new file mode 100644
index 000000000..371d2a2dd
--- /dev/null
+++ b/qa/tasks/workunit.py
@@ -0,0 +1,438 @@
+"""
+Workunit task -- Run ceph on sets of specific clients
+"""
+import logging
+import pipes
+import os
+import re
+import shlex
+
+from tasks.util import get_remote_for_role
+from tasks.util.workunit import get_refspec_after_overrides
+
+from teuthology import misc
+from teuthology.config import config as teuth_config
+from teuthology.orchestra.run import CommandFailedError
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Run ceph on all workunits found under the specified path.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - workunit:
+            clients:
+              client.0: [direct_io, xattrs.sh]
+              client.1: [snaps]
+            branch: foo
+
+    You can also run a list of workunits on all clients:
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - workunit:
+            tag: v0.47
+            clients:
+              all: [direct_io, xattrs.sh, snaps]
+
+    If you have an "all" section it will run all the workunits
+    on each client simultaneously, AFTER running any workunits specified
+    for individual clients. (This prevents unintended simultaneous runs.)
+
+    To customize tests, you can specify environment variables as a dict. You
+    can also specify a time limit for each work unit (defaults to 3h):
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - workunit:
+            sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6
+            clients:
+              all: [snaps]
+            env:
+              FOO: bar
+              BAZ: quux
+            timeout: 3h
+
+    You can also pass optional arguments to the found workunits:
+
+        tasks:
+        - workunit:
+            clients:
+              all:
+                - test-ceph-helpers.sh test_get_config
+
+    This task supports roles that include a ceph cluster, e.g.::
+
+        tasks:
+        - ceph:
+        - workunit:
+            clients:
+              backup.client.0: [foo]
+              client.1: [bar] # cluster is implicitly 'ceph'
+
+    You can also specify an alternative top-level dir to 'qa/workunits', like
+    'qa/standalone', with::
+
+        tasks:
+        - install:
+        - workunit:
+            basedir: qa/standalone
+            clients:
+              client.0:
+                - test-ceph-helpers.sh
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert isinstance(config, dict)
+    assert isinstance(config.get('clients'), dict), \
+        'configuration must contain a dictionary of clients'
+
+    overrides = ctx.config.get('overrides', {})
+    refspec = get_refspec_after_overrides(config, overrides)
+    timeout = config.get('timeout', '3h')
+    cleanup = config.get('cleanup', True)
+
+    log.info('Pulling workunits from ref %s', refspec)
+
+    created_mountpoint = {}
+
+    if config.get('env') is not None:
+        assert isinstance(config['env'], dict), 'env must be a dictionary'
+    clients = config['clients']
+
+    # Create scratch dirs for any non-all workunits
+    log.info('Making a separate scratch dir for every client...')
+    for role in clients.keys():
+        assert isinstance(role, str)
+        if role == "all":
+            continue
+
+        assert 'client' in role
+        created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir'))
+        created_mountpoint[role] = created_mnt_dir
+
+    # Execute any non-all workunits
+    log.info("timeout={}".format(timeout))
+    log.info("cleanup={}".format(cleanup))
+    with parallel() as p:
+        for role, tests in clients.items():
+            if role != "all":
+                p.spawn(_run_tests, ctx, refspec, role, tests,
+                        config.get('env'),
+                        basedir=config.get('basedir','qa/workunits'),
+                        timeout=timeout,
+                        cleanup=cleanup,
+                        coverage_and_limits=not config.get('no_coverage_and_limits', None))
+
+    if cleanup:
+        # Clean up dirs from any non-all workunits
+        for role, created in created_mountpoint.items():
+            _delete_dir(ctx, role, created)
+
+    # Execute any 'all' workunits
+    if 'all' in clients:
+        all_tasks = clients["all"]
+        _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
+                              config.get('basedir', 'qa/workunits'),
+                              config.get('subdir'), timeout=timeout,
+                              cleanup=cleanup)
+
+
+def _client_mountpoint(ctx, cluster, id_):
+    """
+    Returns the path to the expected mountpoint for workunits running
+    on some kind of filesystem.
+    """
+    # for compatibility with tasks like ceph-fuse that aren't cluster-aware yet,
+    # only include the cluster name in the dir if the cluster is not 'ceph'
+    if cluster == 'ceph':
+        dir_ = 'mnt.{0}'.format(id_)
+    else:
+        dir_ = 'mnt.{0}.{1}'.format(cluster, id_)
+    return os.path.join(misc.get_testdir(ctx), dir_)
+
+
+def _delete_dir(ctx, role, created_mountpoint):
+    """
+    Delete file used by this role, and delete the directory that this
+    role appeared in.
+
+    :param ctx: Context
+    :param role: "role.#" where # is used for the role id.
+    """
+    cluster, _, id_ = misc.split_role(role)
+    remote = get_remote_for_role(ctx, role)
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    client = os.path.join(mnt, 'client.{id}'.format(id=id_))
+
+    # Remove the directory inside the mount where the workunit ran
+    remote.run(
+        args=[
+            'sudo',
+            'rm',
+            '-rf',
+            '--',
+            client,
+        ],
+    )
+    log.info("Deleted dir {dir}".format(dir=client))
+
+    # If the mount was an artificially created dir, delete that too
+    if created_mountpoint:
+        remote.run(
+            args=[
+                'rmdir',
+                '--',
+                mnt,
+            ],
+        )
+        log.info("Deleted artificial mount point {dir}".format(dir=client))
+
+
+def _make_scratch_dir(ctx, role, subdir):
+    """
+    Make scratch directories for this role.  This also makes the mount
+    point if that directory does not exist.
+
+    :param ctx: Context
+    :param role: "role.#" where # is used for the role id.
+    :param subdir: use this subdir (False if not used)
+    """
+    created_mountpoint = False
+    cluster, _, id_ = misc.split_role(role)
+    remote = get_remote_for_role(ctx, role)
+    dir_owner = remote.user
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    # if neither kclient nor ceph-fuse are required for a workunit,
+    # mnt may not exist. Stat and create the directory if it doesn't.
+    try:
+        remote.run(
+            args=[
+                'stat',
+                '--',
+                mnt,
+            ],
+        )
+        log.info('Did not need to create dir {dir}'.format(dir=mnt))
+    except CommandFailedError:
+        remote.run(
+            args=[
+                'mkdir',
+                '--',
+                mnt,
+            ],
+        )
+        log.info('Created dir {dir}'.format(dir=mnt))
+        created_mountpoint = True
+
+    if not subdir:
+        subdir = 'client.{id}'.format(id=id_)
+
+    if created_mountpoint:
+        remote.run(
+            args=[
+                'cd',
+                '--',
+                mnt,
+                run.Raw('&&'),
+                'mkdir',
+                '--',
+                subdir,
+            ],
+        )
+    else:
+        remote.run(
+            args=[
+                # cd first so this will fail if the mount point does
+                # not exist; pure install -d will silently do the
+                # wrong thing
+                'cd',
+                '--',
+                mnt,
+                run.Raw('&&'),
+                'sudo',
+                'install',
+                '-d',
+                '-m', '0755',
+                '--owner={user}'.format(user=dir_owner),
+                '--',
+                subdir,
+            ],
+        )
+
+    return created_mountpoint
+
+
+def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None, cleanup=True):
+    """
+    Make a scratch directory for each client in the cluster, and then for each
+    test spawn _run_tests() for each role.
+
+    See run_tests() for parameter documentation.
+    """
+    is_client = misc.is_type('client')
+    client_remotes = {}
+    created_mountpoint = {}
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for role in roles_for_host:
+            if is_client(role):
+                client_remotes[role] = remote
+                created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir)
+
+    for unit in tests:
+        with parallel() as p:
+            for role, remote in client_remotes.items():
+                p.spawn(_run_tests, ctx, refspec, role, [unit], env,
+                        basedir,
+                        subdir,
+                        timeout=timeout)
+
+    # cleanup the generated client directories
+    if cleanup:
+        for role, _ in client_remotes.items():
+            _delete_dir(ctx, role, created_mountpoint[role])
+
+
+def _run_tests(ctx, refspec, role, tests, env, basedir,
+               subdir=None, timeout=None, cleanup=True,
+               coverage_and_limits=True):
+    """
+    Run the individual test. Create a scratch directory and then extract the
+    workunits from git. Make the executables, and then run the tests.
+    Clean up (remove files created) after the tests are finished.
+
+    :param ctx:     Context
+    :param refspec: branch, sha1, or version tag used to identify this
+                    build
+    :param tests:   specific tests specified.
+    :param env:     environment set in yaml file.  Could be None.
+    :param subdir:  subdirectory set in yaml file.  Could be None
+    :param timeout: If present, use the 'timeout' command on the remote host
+                    to limit execution time. Must be specified by a number
+                    followed by 's' for seconds, 'm' for minutes, 'h' for
+                    hours, or 'd' for days. If '0' or anything that evaluates
+                    to False is passed, the 'timeout' command is not used.
+    """
+    testdir = misc.get_testdir(ctx)
+    assert isinstance(role, str)
+    cluster, type_, id_ = misc.split_role(role)
+    assert type_ == 'client'
+    remote = get_remote_for_role(ctx, role)
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    # subdir so we can remove and recreate this a lot without sudo
+    if subdir is None:
+        scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp')
+    else:
+        scratch_tmp = os.path.join(mnt, subdir)
+    clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role)
+    srcdir = '{cdir}/{basedir}'.format(cdir=clonedir,
+                                       basedir=basedir)
+
+    git_url = teuth_config.get_ceph_qa_suite_git_url()
+    # if we are running an upgrade test, and ceph-ci does not have branches like
+    # `jewel`, so should use ceph.git as an alternative.
+    try:
+        remote.run(logger=log.getChild(role),
+                   args=refspec.clone(git_url, clonedir))
+    except CommandFailedError:
+        if git_url.endswith('/ceph-ci.git'):
+            alt_git_url = git_url.replace('/ceph-ci.git', '/ceph.git')
+        elif git_url.endswith('/ceph-ci'):
+            alt_git_url = re.sub(r'/ceph-ci$', '/ceph.git', git_url)
+        else:
+            raise
+        log.info(
+            "failed to check out '%s' from %s; will also try in %s",
+            refspec,
+            git_url,
+            alt_git_url,
+        )
+        remote.run(logger=log.getChild(role),
+                   args=refspec.clone(alt_git_url, clonedir))
+    remote.run(
+        logger=log.getChild(role),
+        args=[
+            'cd', '--', srcdir,
+            run.Raw('&&'),
+            'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
+            run.Raw('&&'),
+            'find', '-executable', '-type', 'f', '-printf', r'%P\0',
+            run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)),
+        ],
+    )
+
+    workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)
+    workunits = sorted(remote.read_file(workunits_file).decode().split('\0'))
+    assert workunits
+
+    try:
+        assert isinstance(tests, list)
+        for spec in tests:
+            dir_or_fname, *optional_args = shlex.split(spec)
+            log.info('Running workunits matching %s on %s...', dir_or_fname, role)
+            # match executables named "foo" or "foo/*" with workunit named
+            # "foo"
+            to_run = [w for w in workunits
+                      if os.path.commonpath([w, dir_or_fname]) == dir_or_fname]
+            if not to_run:
+                raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec))
+            for workunit in to_run:
+                log.info('Running workunit %s...', workunit)
+                args = [
+                    'mkdir', '-p', '--', scratch_tmp,
+                    run.Raw('&&'),
+                    'cd', '--', scratch_tmp,
+                    run.Raw('&&'),
+                    run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'),
+                    run.Raw('CEPH_REF={ref}'.format(ref=refspec)),
+                    run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
+                    run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)),
+                    run.Raw('CEPH_ID="{id}"'.format(id=id_)),
+                    run.Raw('PATH=$PATH:/usr/sbin'),
+                    run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)),
+                    run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)),
+                    run.Raw('CEPH_MNT={dir}'.format(dir=mnt)),
+                ]
+                if env is not None:
+                    for var, val in env.items():
+                        quoted_val = pipes.quote(val)
+                        env_arg = '{var}={val}'.format(var=var, val=quoted_val)
+                        args.append(run.Raw(env_arg))
+                if coverage_and_limits:
+                    args.extend([
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        '{tdir}/archive/coverage'.format(tdir=testdir)])
+                if timeout and timeout != '0':
+                    args.extend(['timeout', timeout])
+                args.extend([
+                    '{srcdir}/{workunit}'.format(
+                        srcdir=srcdir,
+                        workunit=workunit,
+                    ),
+                ])
+                remote.run(
+                    logger=log.getChild(role),
+                    args=args + optional_args,
+                    label="workunit test {workunit}".format(workunit=workunit)
+                )
+                if cleanup:
+                    args=['sudo', 'rm', '-rf', '--', scratch_tmp]
+                    remote.run(logger=log.getChild(role), args=args, timeout=(60*60))
+    finally:
+        log.info('Stopping %s on %s...', tests, role)
+        args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir]
+        # N.B. don't cleanup scratch_tmp! If the mount is broken then rm will hang.
+        remote.run(
+            logger=log.getChild(role),
+            args=args,
+        )