diff options
Diffstat (limited to '')
-rw-r--r-- | qa/tasks/workunit.py | 423 |
1 files changed, 423 insertions, 0 deletions
diff --git a/qa/tasks/workunit.py b/qa/tasks/workunit.py new file mode 100644 index 00000000..5a767038 --- /dev/null +++ b/qa/tasks/workunit.py @@ -0,0 +1,423 @@ +""" +Workunit task -- Run ceph on sets of specific clients +""" +import logging +import pipes +import os +import re + +import six + +from tasks.util import get_remote_for_role +from tasks.util.workunit import get_refspec_after_overrides + +from teuthology import misc +from teuthology.config import config as teuth_config +from teuthology.orchestra.run import CommandFailedError +from teuthology.parallel import parallel +from teuthology.orchestra import run + +log = logging.getLogger(__name__) + +def task(ctx, config): + """ + Run ceph on all workunits found under the specified path. + + For example:: + + tasks: + - ceph: + - ceph-fuse: [client.0] + - workunit: + clients: + client.0: [direct_io, xattrs.sh] + client.1: [snaps] + branch: foo + + You can also run a list of workunits on all clients: + tasks: + - ceph: + - ceph-fuse: + - workunit: + tag: v0.47 + clients: + all: [direct_io, xattrs.sh, snaps] + + If you have an "all" section it will run all the workunits + on each client simultaneously, AFTER running any workunits specified + for individual clients. (This prevents unintended simultaneous runs.) + + To customize tests, you can specify environment variables as a dict. You + can also specify a time limit for each work unit (defaults to 3h): + + tasks: + - ceph: + - ceph-fuse: + - workunit: + sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6 + clients: + all: [snaps] + env: + FOO: bar + BAZ: quux + timeout: 3h + + This task supports roles that include a ceph cluster, e.g.:: + + tasks: + - ceph: + - workunit: + clients: + backup.client.0: [foo] + client.1: [bar] # cluster is implicitly 'ceph' + + You can also specify an alternative top-level dir to 'qa/workunits', like + 'qa/standalone', with:: + + tasks: + - install: + - workunit: + basedir: qa/standalone + clients: + client.0: + - test-ceph-helpers.sh + + :param ctx: Context + :param config: Configuration + """ + assert isinstance(config, dict) + assert isinstance(config.get('clients'), dict), \ + 'configuration must contain a dictionary of clients' + + overrides = ctx.config.get('overrides', {}) + refspec = get_refspec_after_overrides(config, overrides) + timeout = config.get('timeout', '3h') + cleanup = config.get('cleanup', True) + + log.info('Pulling workunits from ref %s', refspec) + + created_mountpoint = {} + + if config.get('env') is not None: + assert isinstance(config['env'], dict), 'env must be a dictionary' + clients = config['clients'] + + # Create scratch dirs for any non-all workunits + log.info('Making a separate scratch dir for every client...') + for role in clients.keys(): + assert isinstance(role, six.string_types) + if role == "all": + continue + + assert 'client' in role + created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir')) + created_mountpoint[role] = created_mnt_dir + + # Execute any non-all workunits + log.info("timeout={}".format(timeout)) + log.info("cleanup={}".format(cleanup)) + with parallel() as p: + for role, tests in clients.items(): + if role != "all": + p.spawn(_run_tests, ctx, refspec, role, tests, + config.get('env'), + basedir=config.get('basedir','qa/workunits'), + timeout=timeout,cleanup=cleanup) + + if cleanup: + # Clean up dirs from any non-all workunits + for role, created in created_mountpoint.items(): + _delete_dir(ctx, role, created) + + # Execute any 'all' workunits + if 'all' in clients: + all_tasks = clients["all"] + _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'), + config.get('basedir', 'qa/workunits'), + config.get('subdir'), timeout=timeout, + cleanup=cleanup) + + +def _client_mountpoint(ctx, cluster, id_): + """ + Returns the path to the expected mountpoint for workunits running + on some kind of filesystem. + """ + # for compatibility with tasks like ceph-fuse that aren't cluster-aware yet, + # only include the cluster name in the dir if the cluster is not 'ceph' + if cluster == 'ceph': + dir_ = 'mnt.{0}'.format(id_) + else: + dir_ = 'mnt.{0}.{1}'.format(cluster, id_) + return os.path.join(misc.get_testdir(ctx), dir_) + + +def _delete_dir(ctx, role, created_mountpoint): + """ + Delete file used by this role, and delete the directory that this + role appeared in. + + :param ctx: Context + :param role: "role.#" where # is used for the role id. + """ + cluster, _, id_ = misc.split_role(role) + remote = get_remote_for_role(ctx, role) + mnt = _client_mountpoint(ctx, cluster, id_) + client = os.path.join(mnt, 'client.{id}'.format(id=id_)) + + # Remove the directory inside the mount where the workunit ran + remote.run( + args=[ + 'sudo', + 'rm', + '-rf', + '--', + client, + ], + ) + log.info("Deleted dir {dir}".format(dir=client)) + + # If the mount was an artificially created dir, delete that too + if created_mountpoint: + remote.run( + args=[ + 'rmdir', + '--', + mnt, + ], + ) + log.info("Deleted artificial mount point {dir}".format(dir=client)) + + +def _make_scratch_dir(ctx, role, subdir): + """ + Make scratch directories for this role. This also makes the mount + point if that directory does not exist. + + :param ctx: Context + :param role: "role.#" where # is used for the role id. + :param subdir: use this subdir (False if not used) + """ + created_mountpoint = False + cluster, _, id_ = misc.split_role(role) + remote = get_remote_for_role(ctx, role) + dir_owner = remote.user + mnt = _client_mountpoint(ctx, cluster, id_) + # if neither kclient nor ceph-fuse are required for a workunit, + # mnt may not exist. Stat and create the directory if it doesn't. + try: + remote.run( + args=[ + 'stat', + '--', + mnt, + ], + ) + log.info('Did not need to create dir {dir}'.format(dir=mnt)) + except CommandFailedError: + remote.run( + args=[ + 'mkdir', + '--', + mnt, + ], + ) + log.info('Created dir {dir}'.format(dir=mnt)) + created_mountpoint = True + + if not subdir: + subdir = 'client.{id}'.format(id=id_) + + if created_mountpoint: + remote.run( + args=[ + 'cd', + '--', + mnt, + run.Raw('&&'), + 'mkdir', + '--', + subdir, + ], + ) + else: + remote.run( + args=[ + # cd first so this will fail if the mount point does + # not exist; pure install -d will silently do the + # wrong thing + 'cd', + '--', + mnt, + run.Raw('&&'), + 'sudo', + 'install', + '-d', + '-m', '0755', + '--owner={user}'.format(user=dir_owner), + '--', + subdir, + ], + ) + + return created_mountpoint + + +def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None, cleanup=True): + """ + Make a scratch directory for each client in the cluster, and then for each + test spawn _run_tests() for each role. + + See run_tests() for parameter documentation. + """ + is_client = misc.is_type('client') + client_remotes = {} + created_mountpoint = {} + for remote, roles_for_host in ctx.cluster.remotes.items(): + for role in roles_for_host: + if is_client(role): + client_remotes[role] = remote + created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir) + + for unit in tests: + with parallel() as p: + for role, remote in client_remotes.items(): + p.spawn(_run_tests, ctx, refspec, role, [unit], env, + basedir, + subdir, + timeout=timeout) + + # cleanup the generated client directories + if cleanup: + for role, _ in client_remotes.items(): + _delete_dir(ctx, role, created_mountpoint[role]) + + +def _run_tests(ctx, refspec, role, tests, env, basedir, + subdir=None, timeout=None, cleanup=True): + """ + Run the individual test. Create a scratch directory and then extract the + workunits from git. Make the executables, and then run the tests. + Clean up (remove files created) after the tests are finished. + + :param ctx: Context + :param refspec: branch, sha1, or version tag used to identify this + build + :param tests: specific tests specified. + :param env: environment set in yaml file. Could be None. + :param subdir: subdirectory set in yaml file. Could be None + :param timeout: If present, use the 'timeout' command on the remote host + to limit execution time. Must be specified by a number + followed by 's' for seconds, 'm' for minutes, 'h' for + hours, or 'd' for days. If '0' or anything that evaluates + to False is passed, the 'timeout' command is not used. + """ + testdir = misc.get_testdir(ctx) + assert isinstance(role, six.string_types) + cluster, type_, id_ = misc.split_role(role) + assert type_ == 'client' + remote = get_remote_for_role(ctx, role) + mnt = _client_mountpoint(ctx, cluster, id_) + # subdir so we can remove and recreate this a lot without sudo + if subdir is None: + scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp') + else: + scratch_tmp = os.path.join(mnt, subdir) + clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role) + srcdir = '{cdir}/{basedir}'.format(cdir=clonedir, + basedir=basedir) + + git_url = teuth_config.get_ceph_qa_suite_git_url() + # if we are running an upgrade test, and ceph-ci does not have branches like + # `jewel`, so should use ceph.git as an alternative. + try: + remote.run(logger=log.getChild(role), + args=refspec.clone(git_url, clonedir)) + except CommandFailedError: + if git_url.endswith('/ceph-ci.git'): + alt_git_url = git_url.replace('/ceph-ci.git', '/ceph.git') + elif git_url.endswith('/ceph-ci'): + alt_git_url = re.sub(r'/ceph-ci$', '/ceph.git', git_url) + else: + raise + log.info( + "failed to check out '%s' from %s; will also try in %s", + refspec, + git_url, + alt_git_url, + ) + remote.run(logger=log.getChild(role), + args=refspec.clone(alt_git_url, clonedir)) + remote.run( + logger=log.getChild(role), + args=[ + 'cd', '--', srcdir, + run.Raw('&&'), + 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi', + run.Raw('&&'), + 'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir), + run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)), + ], + ) + + workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role) + workunits = sorted(six.ensure_str(misc.get_file(remote, workunits_file)).split('\0')) + assert workunits + + try: + assert isinstance(tests, list) + for spec in tests: + log.info('Running workunits matching %s on %s...', spec, role) + prefix = '{spec}/'.format(spec=spec) + to_run = [w for w in workunits if w == spec or w.startswith(prefix)] + if not to_run: + raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec)) + for workunit in to_run: + log.info('Running workunit %s...', workunit) + args = [ + 'mkdir', '-p', '--', scratch_tmp, + run.Raw('&&'), + 'cd', '--', scratch_tmp, + run.Raw('&&'), + run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'), + run.Raw('CEPH_REF={ref}'.format(ref=refspec)), + run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)), + run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)), + run.Raw('CEPH_ID="{id}"'.format(id=id_)), + run.Raw('PATH=$PATH:/usr/sbin'), + run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)), + run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)), + ] + if env is not None: + for var, val in env.items(): + quoted_val = pipes.quote(val) + env_arg = '{var}={val}'.format(var=var, val=quoted_val) + args.append(run.Raw(env_arg)) + args.extend([ + 'adjust-ulimits', + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=testdir)]) + if timeout and timeout != '0': + args.extend(['timeout', timeout]) + args.extend([ + '{srcdir}/{workunit}'.format( + srcdir=srcdir, + workunit=workunit, + ), + ]) + remote.run( + logger=log.getChild(role), + args=args, + label="workunit test {workunit}".format(workunit=workunit) + ) + if cleanup: + args=['sudo', 'rm', '-rf', '--', scratch_tmp] + remote.run(logger=log.getChild(role), args=args, timeout=(60*60)) + finally: + log.info('Stopping %s on %s...', tests, role) + args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir] + # N.B. don't cleanup scratch_tmp! If the mount is broken then rm will hang. + remote.run( + logger=log.getChild(role), + args=args, + ) |