diff options
Diffstat (limited to 'qa/tasks/cephfs/kernel_mount.py')
-rw-r--r-- | qa/tasks/cephfs/kernel_mount.py | 260 |
1 files changed, 260 insertions, 0 deletions
diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py new file mode 100644 index 00000000..d027bcfc --- /dev/null +++ b/qa/tasks/cephfs/kernel_mount.py @@ -0,0 +1,260 @@ +import json +import logging +import time +from textwrap import dedent +from teuthology.orchestra.run import CommandFailedError +from teuthology import misc + +from teuthology.orchestra import remote as orchestra_remote +from teuthology.orchestra import run +from teuthology.contextutil import MaxWhileTries +from tasks.cephfs.mount import CephFSMount + +log = logging.getLogger(__name__) + + +UMOUNT_TIMEOUT = 300 + + +class KernelMount(CephFSMount): + def __init__(self, ctx, test_dir, client_id, client_remote, + ipmi_user, ipmi_password, ipmi_domain): + super(KernelMount, self).__init__(ctx, test_dir, client_id, client_remote) + + self.mounted = False + self.ipmi_user = ipmi_user + self.ipmi_password = ipmi_password + self.ipmi_domain = ipmi_domain + + def mount(self, mount_path=None, mount_fs_name=None, mountpoint=None): + if mountpoint is not None: + self.mountpoint = mountpoint + self.setupfs(name=mount_fs_name) + + log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format( + id=self.client_id, remote=self.client_remote, mnt=self.mountpoint)) + + self.client_remote.run(args=['mkdir', '-p', self.mountpoint], + timeout=(5*60)) + + if mount_path is None: + mount_path = "/" + + opts = 'name={id},norequire_active_mds,conf={conf}'.format(id=self.client_id, + conf=self.config_path) + + if mount_fs_name is not None: + opts += ",mds_namespace={0}".format(mount_fs_name) + + self.client_remote.run( + args=[ + 'sudo', + 'adjust-ulimits', + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=self.test_dir), + '/bin/mount', + '-t', + 'ceph', + ':{mount_path}'.format(mount_path=mount_path), + self.mountpoint, + '-v', + '-o', + opts + ], + timeout=(30*60), + ) + + self.client_remote.run( + args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(5*60)) + + self.mounted = True + + def umount(self, force=False): + if not self.is_mounted(): + return + + log.debug('Unmounting client client.{id}...'.format(id=self.client_id)) + + cmd=['sudo', 'umount', self.mountpoint] + if force: + cmd.append('-f') + + try: + self.client_remote.run(args=cmd, timeout=(15*60)) + except Exception as e: + self.client_remote.run(args=[ + 'sudo', + run.Raw('PATH=/usr/sbin:$PATH'), + 'lsof', + run.Raw(';'), + 'ps', 'auxf', + ], timeout=(15*60)) + raise e + + rproc = self.client_remote.run( + args=[ + 'rmdir', + '--', + self.mountpoint, + ], + wait=False + ) + run.wait([rproc], UMOUNT_TIMEOUT) + self.mounted = False + + def cleanup(self): + pass + + def umount_wait(self, force=False, require_clean=False, timeout=900): + """ + Unlike the fuse client, the kernel client's umount is immediate + """ + if not self.is_mounted(): + return + + try: + self.umount(force) + except (CommandFailedError, MaxWhileTries): + if not force: + raise + + self.kill() + self.kill_cleanup() + + self.mounted = False + + def is_mounted(self): + return self.mounted + + def wait_until_mounted(self): + """ + Unlike the fuse client, the kernel client is up and running as soon + as the initial mount() function returns. + """ + assert self.mounted + + def teardown(self): + super(KernelMount, self).teardown() + if self.mounted: + self.umount() + + def kill(self): + """ + The Ceph kernel client doesn't have a mechanism to kill itself (doing + that in side the kernel would be weird anyway), so we reboot the whole node + to get the same effect. + + We use IPMI to reboot, because we don't want the client to send any + releases of capabilities. + """ + + con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, + self.ipmi_user, + self.ipmi_password, + self.ipmi_domain) + con.hard_reset(wait_for_login=False) + + self.mounted = False + + def kill_cleanup(self): + assert not self.mounted + + # We need to do a sleep here because we don't know how long it will + # take for a hard_reset to be effected. + time.sleep(30) + + try: + # Wait for node to come back up after reboot + misc.reconnect(None, 300, [self.client_remote]) + except: + # attempt to get some useful debug output: + con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, + self.ipmi_user, + self.ipmi_password, + self.ipmi_domain) + con.check_status(timeout=60) + raise + + # Remove mount directory + self.client_remote.run(args=['uptime'], timeout=10) + + # Remove mount directory + self.client_remote.run( + args=[ + 'rmdir', + '--', + self.mountpoint, + ], + timeout=(5*60), + check_status=False, + ) + + def _find_debug_dir(self): + """ + Find the debugfs folder for this mount + """ + pyscript = dedent(""" + import glob + import os + import json + + def get_id_to_dir(): + result = {} + for dir in glob.glob("/sys/kernel/debug/ceph/*"): + mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines() + client_id = mds_sessions_lines[1].split()[1].strip('"') + + result[client_id] = dir + return result + + print(json.dumps(get_id_to_dir())) + """) + + output = self.client_remote.sh([ + 'sudo', 'python3', '-c', pyscript + ], timeout=(5*60)) + client_id_to_dir = json.loads(output) + + try: + return client_id_to_dir[self.client_id] + except KeyError: + log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format( + self.client_id, ",".join(client_id_to_dir.keys()) + )) + raise + + def _read_debug_file(self, filename): + debug_dir = self._find_debug_dir() + + pyscript = dedent(""" + import os + + print(open(os.path.join("{debug_dir}", "{filename}")).read()) + """).format(debug_dir=debug_dir, filename=filename) + + output = self.client_remote.sh([ + 'sudo', 'python3', '-c', pyscript + ], timeout=(5*60)) + return output + + def get_global_id(self): + """ + Look up the CephFS client ID for this mount, using debugfs. + """ + + assert self.mounted + + mds_sessions = self._read_debug_file("mds_sessions") + lines = mds_sessions.split("\n") + return int(lines[0].split()[1]) + + def get_osd_epoch(self): + """ + Return 2-tuple of osd_epoch, osd_epoch_barrier + """ + osd_map = self._read_debug_file("osdmap") + lines = osd_map.split("\n") + first_line_tokens = lines[0].split() + epoch, barrier = int(first_line_tokens[1]), int(first_line_tokens[3]) + + return epoch, barrier |