diff options
Diffstat (limited to 'src/ceph-crash.in')
-rwxr-xr-x | src/ceph-crash.in | 147 |
1 files changed, 147 insertions, 0 deletions
#!@Python3_EXECUTABLE@
# -*- mode:python -*-
# vim: ts=4 sw=4 smarttab expandtab
"""Watch a directory for crash dumps and post them to the cluster.

Each crash lives in its own sub-directory of the monitored path and is
considered complete once it contains both a ``meta`` file and a ``done``
marker.  Complete crashes are submitted with ``ceph crash post`` and, on
success, moved into the ``posted/`` sub-directory.
"""

import argparse
import grp
import logging
import os
import pwd
import signal
import socket
import subprocess
import sys
import time

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('ceph-crash')

# Names tried in order when authenticating; replaced by --name if given.
auth_names = ['client.crash.%s' % socket.gethostname(),
              'client.crash',
              'client.admin']


def parse_args(argv=None):
    """Parse the command line and return the resulting namespace.

    :param argv: argument list to parse; ``None`` (the default) makes
                 argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--path', default='/var/lib/ceph/crash',
        help='base path to monitor for crash dumps')
    parser.add_argument(
        '-d', '--delay', default=10.0, type=float,
        help='minutes to delay between scans (0 to exit after one)',
    )
    parser.add_argument(
        '--name', '-n',
        help='ceph name to authenticate as '
             '(default: try client.crash, client.admin)')
    parser.add_argument(
        '--log-level', '-l',
        help='log level output (default: INFO), support INFO or DEBUG')

    return parser.parse_args(argv)


def post_crash(path):
    """Post the crash whose ``meta`` file lives in directory *path*.

    Every entry of ``auth_names`` is tried in turn until one of the
    ``ceph crash post`` invocations exits with status 0.

    :param path: crash directory containing the ``meta`` file.
    :returns: exit status of the last attempt (0 on success).
    :raises OSError: if ``meta`` cannot be read or the command cannot be
                     started.
    """
    # Read the report once, before spawning anything: a failing open() must
    # not leave a child process behind waiting on its stdin pipe.
    with open(os.path.join(path, 'meta'), 'rb') as f:
        meta = f.read()

    rc = 0
    for n in auth_names:
        pr = subprocess.Popen(
            args=['timeout', '30', 'ceph',
                  '-n', n,
                  'crash', 'post', '-i', '-'],
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        (_, stderr) = pr.communicate(input=meta)
        # errors='replace': the text is only logged, so undecodable bytes
        # must not turn into an exception.
        stderr = stderr.decode(errors='replace')
        # communicate() has already reaped the child.
        rc = pr.returncode
        if rc != 0 or stderr != "":
            log.warning('post %s as %s failed: %s' % (path, n, stderr))
        if rc == 0:
            break
    return rc


def find_pending_crashes(path, done_wait=1.0):
    """Yield the names of crash directories under *path* ready for posting.

    A directory qualifies when it is readable and holds both a ``meta`` file
    and a ``done`` marker.  Directories that are unreadable, have no
    ``meta`` file, or are still incomplete after the grace period are
    skipped; they never stop the scan of the remaining entries.

    :param path: base directory holding one sub-directory per crash.
    :param done_wait: seconds to wait for a missing ``done`` marker before
                      giving up on that crash for this scan.
    """
    for p in os.listdir(path):
        crashpath = os.path.join(path, p)
        if not os.access(crashpath, os.R_OK):
            log.warning('unable to read crash path %s' % (crashpath))
            continue
        if not os.path.isfile(os.path.join(crashpath, 'meta')):
            continue
        donepath = os.path.join(crashpath, 'done')
        if not os.path.isfile(donepath):
            # hang out just for a bit; either we interrupted the dump
            # or the daemon crashed before finishing it
            time.sleep(done_wait)
            if not os.path.isfile(donepath):
                # Skip only this crash: one dump that never completes must
                # not keep every other crash from being posted.
                continue
        yield p


def scrape_path(path):
    """Post every complete crash found under *path*.

    Successfully posted crashes are moved to ``<path>/posted/``.  A failure
    to read, post or move one crash is logged and does not prevent the
    remaining crashes from being processed.
    """
    for p in find_pending_crashes(path):
        crashpath = os.path.join(path, p)
        try:
            # ok, we can process this one
            rc = post_crash(crashpath)
            if rc == 0:
                os.rename(crashpath, os.path.join(path, 'posted/', p))
                log.debug(
                    "posted %s and renamed %s -> %s " %
                    (os.path.join(crashpath, 'meta'), p,
                     os.path.join('posted/', p))
                )
        except OSError as e:
            log.error(f"Error posting {crashpath}: {e}")


def handler(signum, frame):
    """Signal handler: report the signal and exit with status 0."""
    print('*** Interrupted with signal %d ***' % signum)
    sys.exit(0)


def drop_privs():
    """Switch to the ``ceph`` user and group when started as root.

    Does nothing for a non-root caller.  Exits with status 1 if the switch
    fails, so the process never keeps running as root.
    """
    if os.getuid() == 0:
        try:
            ceph_uid = pwd.getpwnam("ceph").pw_uid
            ceph_gid = grp.getgrnam("ceph").gr_gid
            # Groups and gid first: they can no longer be changed once the
            # uid has been dropped.
            os.setgroups([])
            os.setgid(ceph_gid)
            os.setuid(ceph_uid)
        except Exception as e:
            log.error(f"Unable to drop privileges: {e}")
            sys.exit(1)


def main():
    """Entry point: drop privileges, then scan the crash path forever.

    With ``--delay 0`` a single scan is performed and the process exits.
    """
    global auth_names

    # run as unprivileged ceph user
    drop_privs()

    # exit code 0 on SIGINT, SIGTERM
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    args = parse_args()
    if args.log_level == 'DEBUG':
        log.setLevel(logging.DEBUG)

    postdir = os.path.join(args.path, 'posted')
    if args.name:
        auth_names = [args.name]

    while not os.path.isdir(postdir):
        log.error("directory %s does not exist; please create" % postdir)
        time.sleep(30)

    log.info("pinging cluster to exercise our key")
    pr = subprocess.Popen(args=['timeout', '30', 'ceph', '-s'])
    pr.wait()

    log.info("monitoring path %s, delay %ds" % (args.path, args.delay * 60.0))
    while True:
        try:
            scrape_path(args.path)
        except Exception as e:
            log.error(f"Error scraping {args.path}: {e}")
        if args.delay == 0:
            sys.exit(0)
        time.sleep(args.delay * 60)


if __name__ == "__main__":
    main()