#!@Python3_EXECUTABLE@
# -*- mode:python -*-
# vim: ts=4 sw=4 smarttab expandtab
"""Watch a directory for crash dumps and post them with ``ceph crash post``.

Each sub-directory of the monitored path that contains both a ``meta`` and a
``done`` file is treated as a finished crash dump.  Its ``meta`` file is piped
to ``ceph crash post -i -`` and, on success, the directory is moved into the
``posted`` sub-directory of the monitored path.
"""

import argparse
import grp
import logging
import os
import pwd
import signal
import socket
import subprocess
import sys
import time

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('ceph-crash')

# Entity names tried, in order, when posting a crash; replaced by a single
# entry when --name is given on the command line (see main()).
auth_names = ['client.crash.%s' % socket.gethostname(),
              'client.crash',
              'client.admin']


def parse_args():
    """Parse the command line (``sys.argv``) and return the namespace.

    Returns:
        argparse.Namespace with ``path``, ``delay`` (float, minutes),
        ``name`` and ``log_level`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--path', default='/var/lib/ceph/crash',
        help='base path to monitor for crash dumps')
    parser.add_argument(
        '-d', '--delay', default=10.0, type=float,
        help='minutes to delay between scans (0 to exit after one)',
    )
    parser.add_argument(
        '--name', '-n',
        help='ceph name to authenticate as '
             '(default: try client.crash, client.admin)')
    parser.add_argument(
        '--log-level', '-l',
        help='log level output (default: INFO), support INFO or DEBUG')
    return parser.parse_args()


def post_crash(path):
    """Post the ``meta`` file found in *path* via ``ceph crash post``.

    Every name in ``auth_names`` is tried in order until one invocation exits
    with status 0.

    Args:
        path: crash directory containing a ``meta`` file.

    Returns:
        Exit status of the last ``ceph`` invocation (0 on success).

    Raises:
        OSError: if the ``meta`` file cannot be opened or read, or the
            command cannot be spawned.
    """
    # Read the payload once, before spawning anything: if the file is
    # unreadable we fail without leaving a child process (and its pipes)
    # behind, and the handle is always closed.
    with open(os.path.join(path, 'meta'), 'rb') as f:
        meta = f.read()

    rc = 0
    for n in auth_names:
        pr = subprocess.Popen(
            args=['timeout', '30', 'ceph',
                  '-n', n,
                  'crash', 'post', '-i', '-'],
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        (_, stderr) = pr.communicate(input=meta)
        # Never let undecodable bytes in the error output abort the post.
        stderr = stderr.decode(errors='replace')
        rc = pr.wait()
        if rc != 0 or stderr != "":
            log.warning('post %s as %s failed: %s' % (path, n, stderr))
        if rc == 0:
            break
    return rc


def scrape_path(path):
    """Scan *path* once and post every finished crash dump found in it.

    An entry is considered only when it is readable and contains a ``meta``
    file.  If its ``done`` marker is missing, the scan waits one second and
    checks again; a dump that is still unfinished is skipped so that it
    cannot prevent the remaining entries from being posted.

    Args:
        path: base directory holding one sub-directory per crash and a
            ``posted`` sub-directory for crashes already reported.
    """
    for p in os.listdir(path):
        crashpath = os.path.join(path, p)
        if not os.access(crashpath, os.R_OK):
            log.warning('unable to read crash path %s' % (crashpath))
            continue
        metapath = os.path.join(crashpath, 'meta')
        donepath = os.path.join(crashpath, 'done')
        if not os.path.isfile(metapath):
            continue
        if not os.path.isfile(donepath):
            # hang out just for a bit; either we interrupted the dump
            # or the daemon crashed before finishing it
            time.sleep(1)
            if not os.path.isfile(donepath):
                # Still unfinished: skip only this dump.  Returning here
                # would abandon every entry listed after it, on every scan.
                continue
        # ok, we can process this one
        rc = post_crash(crashpath)
        if rc == 0:
            os.rename(crashpath, os.path.join(path, 'posted/', p))
            log.debug(
                "posted %s and renamed %s -> %s " %
                (metapath, p, os.path.join('posted/', p))
            )


def handler(signum, frame):
    """Signal handler: report the signal number and exit with status 0."""
    print('*** Interrupted with signal %d ***' % signum)
    sys.exit(0)


def drop_privs():
    """Switch to the ``ceph`` user and group when running as uid 0.

    Exits the process with status 1 if the switch fails.
    """
    if os.getuid() == 0:
        try:
            ceph_uid = pwd.getpwnam("ceph").pw_uid
            ceph_gid = grp.getgrnam("ceph").gr_gid
            # Clear supplementary groups and set the gid while still
            # privileged; setuid() has to come last.
            os.setgroups([])
            os.setgid(ceph_gid)
            os.setuid(ceph_uid)
        except Exception as e:
            log.error(f"Unable to drop privileges: {e}")
            sys.exit(1)


def main():
    """Entry point: drop privileges, then scan the crash path in a loop.

    With ``--delay 0`` a single scan is performed and the process exits with
    status 0; otherwise the scan repeats every ``delay`` minutes.
    """
    global auth_names

    # run as unprivileged ceph user
    drop_privs()

    # exit code 0 on SIGINT, SIGTERM
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    args = parse_args()
    if args.log_level == 'DEBUG':
        log.setLevel(logging.DEBUG)

    postdir = os.path.join(args.path, 'posted')
    if args.name:
        auth_names = [args.name]

    # Wait until the destination for posted crashes exists.
    while not os.path.isdir(postdir):
        log.error("directory %s does not exist; please create" % postdir)
        time.sleep(30)

    log.info("pinging cluster to exercise our key")
    # The exit status is deliberately ignored.
    subprocess.call(['timeout', '30', 'ceph', '-s'])

    log.info("monitoring path %s, delay %ds" % (args.path, args.delay * 60.0))
    while True:
        try:
            scrape_path(args.path)
        except Exception as e:
            # Keep the loop alive: a failed scan is retried after the delay.
            log.error(f"Error scraping {args.path}: {e}")
        if args.delay == 0:
            sys.exit(0)
        time.sleep(args.delay * 60)


if __name__ == "__main__":
    main()