1 files changed, 423 insertions, 0 deletions
diff --git a/qa/tasks/workunit.py b/qa/tasks/workunit.py
new file mode 100644
index 00000000..5a767038
--- /dev/null
+++ b/qa/tasks/workunit.py
@@ -0,0 +1,423 @@
+"""
+Workunit task -- Run ceph on sets of specific clients
+"""
+import logging
+import pipes
+import os
+import re
+
+import six
+
+from tasks.util import get_remote_for_role
+from tasks.util.workunit import get_refspec_after_overrides
+
+from teuthology import misc
+from teuthology.config import config as teuth_config
+from teuthology.orchestra.run import CommandFailedError
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Run ceph on all workunits found under the specified path.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - workunit:
+            clients:
+              client.0: [direct_io, xattrs.sh]
+              client.1: [snaps]
+            branch: foo
+
+    You can also run a list of workunits on all clients:
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - workunit:
+            tag: v0.47
+            clients:
+              all: [direct_io, xattrs.sh, snaps]
+
+    If you have an "all" section it will run all the workunits
+    on each client simultaneously, AFTER running any workunits specified
+    for individual clients. (This prevents unintended simultaneous runs.)
+
+    To customize tests, you can specify environment variables as a dict. You
+    can also specify a time limit for each work unit (defaults to 3h):
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - workunit:
+            sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6
+            clients:
+              all: [snaps]
+            env:
+              FOO: bar
+              BAZ: quux
+            timeout: 3h
+
+    This task supports roles that include a ceph cluster, e.g.::
+
+        tasks:
+        - ceph:
+        - workunit:
+            clients:
+              backup.client.0: [foo]
+              client.1: [bar] # cluster is implicitly 'ceph'
+
+    You can also specify an alternative top-level dir to 'qa/workunits', like
+    'qa/standalone', with::
+
+        tasks:
+        - install:
+        - workunit:
+            basedir: qa/standalone
+            clients:
+              client.0:
+                - test-ceph-helpers.sh
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert isinstance(config, dict)
+    assert isinstance(config.get('clients'), dict), \
+        'configuration must contain a dictionary of clients'
+
+    overrides = ctx.config.get('overrides', {})
+    refspec = get_refspec_after_overrides(config, overrides)
+    timeout = config.get('timeout', '3h')
+    cleanup = config.get('cleanup', True)
+
+    log.info('Pulling workunits from ref %s', refspec)
+
+    created_mountpoint = {}
+
+    if config.get('env') is not None:
+        assert isinstance(config['env'], dict), 'env must be a dictionary'
+    clients = config['clients']
+
+    # Create scratch dirs for any non-all workunits
+    log.info('Making a separate scratch dir for every client...')
+    for role in clients.keys():
+        assert isinstance(role, six.string_types)
+        if role == "all":
+            continue
+
+        assert 'client' in role
+        created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir'))
+        created_mountpoint[role] = created_mnt_dir
+
+    # Execute any non-all workunits
+    log.info("timeout={}".format(timeout))
+    log.info("cleanup={}".format(cleanup))
+    with parallel() as p:
+        for role, tests in clients.items():
+            if role != "all":
+                p.spawn(_run_tests, ctx, refspec, role, tests,
+                        config.get('env'),
+                        basedir=config.get('basedir','qa/workunits'),
+                        timeout=timeout,cleanup=cleanup)
+
+    if cleanup:
+        # Clean up dirs from any non-all workunits
+        for role, created in created_mountpoint.items():
+            _delete_dir(ctx, role, created)
+
+    # Execute any 'all' workunits
+    if 'all' in clients:
+        all_tasks = clients["all"]
+        _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
+                              config.get('basedir', 'qa/workunits'),
+                              config.get('subdir'), timeout=timeout,
+                              cleanup=cleanup)
+
+
+def _client_mountpoint(ctx, cluster, id_):
+    """
+    Returns the path to the expected mountpoint for workunits running
+    on some kind of filesystem.
+    """
+    # for compatibility with tasks like ceph-fuse that aren't cluster-aware yet,
+    # only include the cluster name in the dir if the cluster is not 'ceph'
+    if cluster == 'ceph':
+        dir_ = 'mnt.{0}'.format(id_)
+    else:
+        dir_ = 'mnt.{0}.{1}'.format(cluster, id_)
+    return os.path.join(misc.get_testdir(ctx), dir_)
+
+
+def _delete_dir(ctx, role, created_mountpoint):
+    """
+    Delete file used by this role, and delete the directory that this
+    role appeared in.
+
+    :param ctx: Context
+    :param role: "role.#" where # is used for the role id.
+    """
+    cluster, _, id_ = misc.split_role(role)
+    remote = get_remote_for_role(ctx, role)
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    client = os.path.join(mnt, 'client.{id}'.format(id=id_))
+
+    # Remove the directory inside the mount where the workunit ran
+    remote.run(
+        args=[
+            'sudo',
+            'rm',
+            '-rf',
+            '--',
+            client,
+        ],
+    )
+    log.info("Deleted dir {dir}".format(dir=client))
+
+    # If the mount was an artificially created dir, delete that too
+    if created_mountpoint:
+        remote.run(
+            args=[
+                'rmdir',
+                '--',
+                mnt,
+            ],
+        )
+        log.info("Deleted artificial mount point {dir}".format(dir=client))
+
+
+def _make_scratch_dir(ctx, role, subdir):
+    """
+    Make scratch directories for this role.  This also makes the mount
+    point if that directory does not exist.
+
+    :param ctx: Context
+    :param role: "role.#" where # is used for the role id.
+    :param subdir: use this subdir (False if not used)
+    """
+    created_mountpoint = False
+    cluster, _, id_ = misc.split_role(role)
+    remote = get_remote_for_role(ctx, role)
+    dir_owner = remote.user
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    # if neither kclient nor ceph-fuse are required for a workunit,
+    # mnt may not exist. Stat and create the directory if it doesn't.
+    try:
+        remote.run(
+            args=[
+                'stat',
+                '--',
+                mnt,
+            ],
+        )
+        log.info('Did not need to create dir {dir}'.format(dir=mnt))
+    except CommandFailedError:
+        remote.run(
+            args=[
+                'mkdir',
+                '--',
+                mnt,
+            ],
+        )
+        log.info('Created dir {dir}'.format(dir=mnt))
+        created_mountpoint = True
+
+    if not subdir:
+        subdir = 'client.{id}'.format(id=id_)
+
+    if created_mountpoint:
+        remote.run(
+            args=[
+                'cd',
+                '--',
+                mnt,
+                run.Raw('&&'),
+                'mkdir',
+                '--',
+                subdir,
+            ],
+        )
+    else:
+        remote.run(
+            args=[
+                # cd first so this will fail if the mount point does
+                # not exist; pure install -d will silently do the
+                # wrong thing
+                'cd',
+                '--',
+                mnt,
+                run.Raw('&&'),
+                'sudo',
+                'install',
+                '-d',
+                '-m', '0755',
+                '--owner={user}'.format(user=dir_owner),
+                '--',
+                subdir,
+            ],
+        )
+
+    return created_mountpoint
+
+
+def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None, cleanup=True):
+    """
+    Make a scratch directory for each client in the cluster, and then for each
+    test spawn _run_tests() for each role.
+
+    See run_tests() for parameter documentation.
+    """
+    is_client = misc.is_type('client')
+    client_remotes = {}
+    created_mountpoint = {}
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for role in roles_for_host:
+            if is_client(role):
+                client_remotes[role] = remote
+                created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir)
+
+    for unit in tests:
+        with parallel() as p:
+            for role, remote in client_remotes.items():
+                p.spawn(_run_tests, ctx, refspec, role, [unit], env,
+                        basedir,
+                        subdir,
+                        timeout=timeout)
+
+    # cleanup the generated client directories
+    if cleanup:
+        for role, _ in client_remotes.items():
+            _delete_dir(ctx, role, created_mountpoint[role])
+
+
+def _run_tests(ctx, refspec, role, tests, env, basedir,
+               subdir=None, timeout=None, cleanup=True):
+    """
+    Run the individual test. Create a scratch directory and then extract the
+    workunits from git. Make the executables, and then run the tests.
+    Clean up (remove files created) after the tests are finished.
+
+    :param ctx:     Context
+    :param refspec: branch, sha1, or version tag used to identify this
+                    build
+    :param tests:   specific tests specified.
+    :param env:     environment set in yaml file.  Could be None.
+    :param subdir:  subdirectory set in yaml file.  Could be None
+    :param timeout: If present, use the 'timeout' command on the remote host
+                    to limit execution time. Must be specified by a number
+                    followed by 's' for seconds, 'm' for minutes, 'h' for
+                    hours, or 'd' for days. If '0' or anything that evaluates
+                    to False is passed, the 'timeout' command is not used.
+    """
+    testdir = misc.get_testdir(ctx)
+    assert isinstance(role, six.string_types)
+    cluster, type_, id_ = misc.split_role(role)
+    assert type_ == 'client'
+    remote = get_remote_for_role(ctx, role)
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    # subdir so we can remove and recreate this a lot without sudo
+    if subdir is None:
+        scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp')
+    else:
+        scratch_tmp = os.path.join(mnt, subdir)
+    clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role)
+    srcdir = '{cdir}/{basedir}'.format(cdir=clonedir,
+                                       basedir=basedir)
+
+    git_url = teuth_config.get_ceph_qa_suite_git_url()
+    # if we are running an upgrade test, and ceph-ci does not have branches like
+    # `jewel`, so should use ceph.git as an alternative.
+    try:
+        remote.run(logger=log.getChild(role),
+                   args=refspec.clone(git_url, clonedir))
+    except CommandFailedError:
+        if git_url.endswith('/ceph-ci.git'):
+            alt_git_url = git_url.replace('/ceph-ci.git', '/ceph.git')
+        elif git_url.endswith('/ceph-ci'):
+            alt_git_url = re.sub(r'/ceph-ci$', '/ceph.git', git_url)
+        else:
+            raise
+        log.info(
+            "failed to check out '%s' from %s; will also try in %s",
+            refspec,
+            git_url,
+            alt_git_url,
+        )
+        remote.run(logger=log.getChild(role),
+                   args=refspec.clone(alt_git_url, clonedir))
+    remote.run(
+        logger=log.getChild(role),
+        args=[
+            'cd', '--', srcdir,
+            run.Raw('&&'),
+            'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
+            run.Raw('&&'),
+            'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir),
+            run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)),
+        ],
+    )
+
+    workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)
+    workunits = sorted(six.ensure_str(misc.get_file(remote, workunits_file)).split('\0'))
+    assert workunits
+
+    try:
+        assert isinstance(tests, list)
+        for spec in tests:
+            log.info('Running workunits matching %s on %s...', spec, role)
+            prefix = '{spec}/'.format(spec=spec)
+            to_run = [w for w in workunits if w == spec or w.startswith(prefix)]
+            if not to_run:
+                raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec))
+            for workunit in to_run:
+                log.info('Running workunit %s...', workunit)
+                args = [
+                    'mkdir', '-p', '--', scratch_tmp,
+                    run.Raw('&&'),
+                    'cd', '--', scratch_tmp,
+                    run.Raw('&&'),
+                    run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'),
+                    run.Raw('CEPH_REF={ref}'.format(ref=refspec)),
+                    run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
+                    run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)),
+                    run.Raw('CEPH_ID="{id}"'.format(id=id_)),
+                    run.Raw('PATH=$PATH:/usr/sbin'),
+                    run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)),
+                    run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)),
+                ]
+                if env is not None:
+                    for var, val in env.items():
+                        quoted_val = pipes.quote(val)
+                        env_arg = '{var}={val}'.format(var=var, val=quoted_val)
+                        args.append(run.Raw(env_arg))
+                args.extend([
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir)])
+                if timeout and timeout != '0':
+                    args.extend(['timeout', timeout])
+                args.extend([
+                    '{srcdir}/{workunit}'.format(
+                        srcdir=srcdir,
+                        workunit=workunit,
+                    ),
+                ])
+                remote.run(
+                    logger=log.getChild(role),
+                    args=args,
+                    label="workunit test {workunit}".format(workunit=workunit)
+                )
+                if cleanup:
+                    args=['sudo', 'rm', '-rf', '--', scratch_tmp]
+                    remote.run(logger=log.getChild(role), args=args, timeout=(60*60))
+    finally:
+        log.info('Stopping %s on %s...', tests, role)
+        args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir]
+        # N.B. don't cleanup scratch_tmp! If the mount is broken then rm will hang.
+        remote.run(
+            logger=log.getChild(role),
+            args=args,
+        )