diff options
Diffstat (limited to 'qa/tasks/cephfs/fuse_mount.py')
-rw-r--r-- | qa/tasks/cephfs/fuse_mount.py | 502 |
1 files changed, 502 insertions, 0 deletions
diff --git a/qa/tasks/cephfs/fuse_mount.py b/qa/tasks/cephfs/fuse_mount.py new file mode 100644 index 00000000..664de4f4 --- /dev/null +++ b/qa/tasks/cephfs/fuse_mount.py @@ -0,0 +1,502 @@ +from io import StringIO +import json +import time +import logging + +import six + +from textwrap import dedent + +from teuthology import misc +from teuthology.contextutil import MaxWhileTries +from teuthology.orchestra import run +from teuthology.orchestra.run import CommandFailedError +from tasks.cephfs.mount import CephFSMount + +log = logging.getLogger(__name__) + + +class FuseMount(CephFSMount): + def __init__(self, ctx, client_config, test_dir, client_id, client_remote): + super(FuseMount, self).__init__(ctx, test_dir, client_id, client_remote) + + self.client_config = client_config if client_config else {} + self.fuse_daemon = None + self._fuse_conn = None + self.id = None + self.inst = None + self.addr = None + + def mount(self, mount_path=None, mount_fs_name=None, mountpoint=None): + if mountpoint is not None: + self.mountpoint = mountpoint + self.setupfs(name=mount_fs_name) + + try: + return self._mount(mount_path, mount_fs_name) + except RuntimeError: + # Catch exceptions by the mount() logic (i.e. not remote command + # failures) and ensure the mount is not left half-up. + # Otherwise we might leave a zombie mount point that causes + # anyone traversing cephtest/ to get hung up on. + log.warning("Trying to clean up after failed mount") + self.umount_wait(force=True) + raise + + def _mount(self, mount_path, mount_fs_name): + log.info("Client client.%s config is %s" % (self.client_id, self.client_config)) + + daemon_signal = 'kill' + if self.client_config.get('coverage') or self.client_config.get('valgrind') is not None: + daemon_signal = 'term' + + log.info('Mounting ceph-fuse client.{id} at {remote} {mnt}...'.format( + id=self.client_id, remote=self.client_remote, mnt=self.mountpoint)) + + self.client_remote.run(args=['mkdir', '-p', self.mountpoint], + timeout=(15*60), cwd=self.test_dir) + + run_cmd = [ + 'sudo', + 'adjust-ulimits', + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=self.test_dir), + 'daemon-helper', + daemon_signal, + ] + + fuse_cmd = ['ceph-fuse', "-f"] + + if mount_path is not None: + fuse_cmd += ["--client_mountpoint={0}".format(mount_path)] + + if mount_fs_name is not None: + fuse_cmd += ["--client_mds_namespace={0}".format(mount_fs_name)] + + fuse_cmd += [ + '--name', 'client.{id}'.format(id=self.client_id), + # TODO ceph-fuse doesn't understand dash dash '--', + self.mountpoint, + ] + + cwd = self.test_dir + if self.client_config.get('valgrind') is not None: + run_cmd = misc.get_valgrind_args( + self.test_dir, + 'client.{id}'.format(id=self.client_id), + run_cmd, + self.client_config.get('valgrind'), + ) + cwd = None # misc.get_valgrind_args chdir for us + + run_cmd.extend(fuse_cmd) + + def list_connections(): + from teuthology.misc import get_system_type + + conn_dir = "/sys/fs/fuse/connections" + + self.client_remote.run(args=['sudo', 'modprobe', 'fuse'], + check_status=False) + self.client_remote.run( + args=["sudo", "mount", "-t", "fusectl", conn_dir, conn_dir], + check_status=False, timeout=(30)) + + try: + ls_str = self.client_remote.sh("ls " + conn_dir, + stdout=StringIO(), + timeout=(15*60)).strip() + except CommandFailedError: + return [] + + if ls_str: + return [int(n) for n in ls_str.split("\n")] + else: + return [] + + # Before starting ceph-fuse process, note the contents of + # /sys/fs/fuse/connections + pre_mount_conns = list_connections() + log.info("Pre-mount connections: {0}".format(pre_mount_conns)) + + proc = self.client_remote.run( + args=run_cmd, + cwd=cwd, + logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)), + stdin=run.PIPE, + wait=False, + ) + self.fuse_daemon = proc + + # Wait for the connection reference to appear in /sys + mount_wait = self.client_config.get('mount_wait', 0) + if mount_wait > 0: + log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait)) + time.sleep(mount_wait) + timeout = int(self.client_config.get('mount_timeout', 30)) + waited = 0 + + post_mount_conns = list_connections() + while len(post_mount_conns) <= len(pre_mount_conns): + if self.fuse_daemon.finished: + # Did mount fail? Raise the CommandFailedError instead of + # hitting the "failed to populate /sys/" timeout + self.fuse_daemon.wait() + time.sleep(1) + waited += 1 + if waited > timeout: + raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format( + waited + )) + else: + post_mount_conns = list_connections() + + log.info("Post-mount connections: {0}".format(post_mount_conns)) + + # Record our fuse connection number so that we can use it when + # forcing an unmount + new_conns = list(set(post_mount_conns) - set(pre_mount_conns)) + if len(new_conns) == 0: + raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns)) + elif len(new_conns) > 1: + raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns)) + else: + self._fuse_conn = new_conns[0] + + self.gather_mount_info() + + def gather_mount_info(self): + status = self.admin_socket(['status']) + self.id = status['id'] + self.client_pid = status['metadata']['pid'] + try: + self.inst = status['inst_str'] + self.addr = status['addr_str'] + except KeyError: + sessions = self.fs.rank_asok(['session', 'ls']) + for s in sessions: + if s['id'] == self.id: + self.inst = s['inst'] + self.addr = self.inst.split()[1] + if self.inst is None: + raise RuntimeError("cannot find client session") + + def is_mounted(self): + proc = self.client_remote.run( + args=[ + 'stat', + '--file-system', + '--printf=%T\n', + '--', + self.mountpoint, + ], + cwd=self.test_dir, + stdout=StringIO(), + stderr=StringIO(), + wait=False, + timeout=(15*60) + ) + try: + proc.wait() + except CommandFailedError: + error = proc.stderr.getvalue() + if ("endpoint is not connected" in error + or "Software caused connection abort" in error): + # This happens is fuse is killed without unmount + log.warning("Found stale moutn point at {0}".format(self.mountpoint)) + return True + else: + # This happens if the mount directory doesn't exist + log.info('mount point does not exist: %s', self.mountpoint) + return False + + fstype = six.ensure_str(proc.stdout.getvalue()).rstrip('\n') + if fstype == 'fuseblk': + log.info('ceph-fuse is mounted on %s', self.mountpoint) + return True + else: + log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format( + fstype=fstype)) + return False + + def wait_until_mounted(self): + """ + Check to make sure that fuse is mounted on mountpoint. If not, + sleep for 5 seconds and check again. + """ + + while not self.is_mounted(): + # Even if it's not mounted, it should at least + # be running: catch simple failures where it has terminated. + assert not self.fuse_daemon.poll() + + time.sleep(5) + + # Now that we're mounted, set permissions so that the rest of the test will have + # unrestricted access to the filesystem mount. + try: + stderr = StringIO() + self.client_remote.run(args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(15*60), cwd=self.test_dir, stderr=stderr) + except run.CommandFailedError: + stderr = stderr.getvalue() + if "Read-only file system".lower() in stderr.lower(): + pass + else: + raise + + def _mountpoint_exists(self): + return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False, cwd=self.test_dir, timeout=(15*60)).exitstatus == 0 + + def umount(self): + if not self.is_mounted(): + return + + try: + log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name)) + self.client_remote.run( + args=[ + 'sudo', + 'fusermount', + '-u', + self.mountpoint, + ], + cwd=self.test_dir, + timeout=(30*60), + ) + except run.CommandFailedError: + log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name)) + + self.client_remote.run(args=[ + 'sudo', + run.Raw('PATH=/usr/sbin:$PATH'), + 'lsof', + run.Raw(';'), + 'ps', + 'auxf', + ], timeout=(60*15)) + + # abort the fuse mount, killing all hung processes + if self._fuse_conn: + self.run_python(dedent(""" + import os + path = "/sys/fs/fuse/connections/{0}/abort" + if os.path.exists(path): + open(path, "w").write("1") + """).format(self._fuse_conn)) + self._fuse_conn = None + + stderr = StringIO() + try: + # make sure its unmounted + self.client_remote.run( + args=[ + 'sudo', + 'umount', + '-l', + '-f', + self.mountpoint, + ], + stderr=stderr, + timeout=(60*15) + ) + except CommandFailedError: + if self.is_mounted(): + raise + + assert not self.is_mounted() + self._fuse_conn = None + self.id = None + self.inst = None + self.addr = None + + def umount_wait(self, force=False, require_clean=False, timeout=900): + """ + :param force: Complete cleanly even if the MDS is offline + """ + if force: + assert not require_clean # mutually exclusive + + # When we expect to be forcing, kill the ceph-fuse process directly. + # This should avoid hitting the more aggressive fallback killing + # in umount() which can affect other mounts too. + self.fuse_daemon.stdin.close() + + # However, we will still hit the aggressive wait if there is an ongoing + # mount -o remount (especially if the remount is stuck because MDSs + # are unavailable) + + self.umount() + + try: + if self.fuse_daemon: + # Permit a timeout, so that we do not block forever + run.wait([self.fuse_daemon], timeout) + except MaxWhileTries: + log.error("process failed to terminate after unmount. This probably" + " indicates a bug within ceph-fuse.") + raise + except CommandFailedError: + if require_clean: + raise + + self.cleanup() + + def cleanup(self): + """ + Remove the mount point. + + Prerequisite: the client is not mounted. + """ + stderr = StringIO() + try: + self.client_remote.run( + args=[ + 'rmdir', + '--', + self.mountpoint, + ], + cwd=self.test_dir, + stderr=stderr, + timeout=(60*5), + check_status=False, + ) + except CommandFailedError: + if "No such file or directory" in stderr.getvalue(): + pass + else: + raise + + def kill(self): + """ + Terminate the client without removing the mount point. + """ + log.info('Killing ceph-fuse connection on {name}...'.format(name=self.client_remote.name)) + self.fuse_daemon.stdin.close() + try: + self.fuse_daemon.wait() + except CommandFailedError: + pass + + def kill_cleanup(self): + """ + Follow up ``kill`` to get to a clean unmounted state. + """ + log.info('Cleaning up killed ceph-fuse connection') + self.umount() + self.cleanup() + + def teardown(self): + """ + Whatever the state of the mount, get it gone. + """ + super(FuseMount, self).teardown() + + self.umount() + + if self.fuse_daemon and not self.fuse_daemon.finished: + self.fuse_daemon.stdin.close() + try: + self.fuse_daemon.wait() + except CommandFailedError: + pass + + # Indiscriminate, unlike the touchier cleanup() + self.client_remote.run( + args=[ + 'rm', + '-rf', + self.mountpoint, + ], + cwd=self.test_dir, + timeout=(60*5) + ) + + def _asok_path(self): + return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id) + + @property + def _prefix(self): + return "" + + def admin_socket(self, args): + pyscript = """ +import glob +import re +import os +import subprocess + +def find_socket(client_name): + asok_path = "{asok_path}" + files = glob.glob(asok_path) + + # Given a non-glob path, it better be there + if "*" not in asok_path: + assert(len(files) == 1) + return files[0] + + for f in files: + pid = re.match(".*\.(\d+)\.asok$", f).group(1) + if os.path.exists("/proc/{{0}}".format(pid)): + return f + raise RuntimeError("Client socket {{0}} not found".format(client_name)) + +print(find_socket("{client_name}")) +""".format( + asok_path=self._asok_path(), + client_name="client.{0}".format(self.client_id)) + + # Find the admin socket + asok_path = self.client_remote.sh( + ['sudo', 'python3', '-c', pyscript], + stdout=StringIO(), + timeout=(15*60)).strip() + log.info("Found client admin socket at {0}".format(asok_path)) + + # Query client ID from admin socket + json_data = self.client_remote.sh( + ['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args, + stdout=StringIO(), + timeout=(15*60)) + return json.loads(json_data) + + def get_global_id(self): + """ + Look up the CephFS client ID for this mount + """ + return self.admin_socket(['mds_sessions'])['id'] + + def get_global_inst(self): + """ + Look up the CephFS client instance for this mount + """ + return self.inst + + def get_global_addr(self): + """ + Look up the CephFS client addr for this mount + """ + return self.addr + + def get_client_pid(self): + """ + return pid of ceph-fuse process + """ + status = self.admin_socket(['status']) + return status['metadata']['pid'] + + def get_osd_epoch(self): + """ + Return 2-tuple of osd_epoch, osd_epoch_barrier + """ + status = self.admin_socket(['status']) + return status['osd_epoch'], status['osd_epoch_barrier'] + + def get_dentry_count(self): + """ + Return 2-tuple of dentry_count, dentry_pinned_count + """ + status = self.admin_socket(['status']) + return status['dentry_count'], status['dentry_pinned_count'] + + def set_cache_size(self, size): + return self.admin_socket(['config', 'set', 'client_cache_size', str(size)]) |