summaryrefslogtreecommitdiffstats
path: root/src/ceph-crash.in
diff options
context:
space:
mode:
Diffstat (limited to 'src/ceph-crash.in')
-rwxr-xr-xsrc/ceph-crash.in147
1 files changed, 147 insertions, 0 deletions
diff --git a/src/ceph-crash.in b/src/ceph-crash.in
new file mode 100755
index 000000000..0e02837fa
--- /dev/null
+++ b/src/ceph-crash.in
@@ -0,0 +1,147 @@
+#!@Python3_EXECUTABLE@
+# -*- mode:python -*-
+# vim: ts=4 sw=4 smarttab expandtab
+
+import argparse
+import grp
+import logging
+import os
+import pwd
+import signal
+import socket
+import subprocess
+import sys
+import time
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger('ceph-crash')
+
+auth_names = ['client.crash.%s' % socket.gethostname(),
+ 'client.crash',
+ 'client.admin']
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '-p', '--path', default='/var/lib/ceph/crash',
+ help='base path to monitor for crash dumps')
+ parser.add_argument(
+ '-d', '--delay', default=10.0, type=float,
+ help='minutes to delay between scans (0 to exit after one)',
+ )
+ parser.add_argument(
+ '--name', '-n',
+ help='ceph name to authenticate as '
+ '(default: try client.crash, client.admin)')
+ parser.add_argument(
+ '--log-level', '-l',
+ help='log level output (default: INFO), support INFO or DEBUG')
+
+ return parser.parse_args()
+
+
+def post_crash(path):
+ rc = 0
+ for n in auth_names:
+ pr = subprocess.Popen(
+ args=['timeout', '30', 'ceph',
+ '-n', n,
+ 'crash', 'post', '-i', '-'],
+ stdin=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ f = open(os.path.join(path, 'meta'), 'rb')
+ (_, stderr) = pr.communicate(input=f.read())
+ stderr = stderr.decode()
+ rc = pr.wait()
+ f.close()
+ if rc != 0 or stderr != "":
+ log.warning('post %s as %s failed: %s' % (path, n, stderr))
+ if rc == 0:
+ break
+ return rc
+
+
+def scrape_path(path):
+ for p in os.listdir(path):
+ crashpath = os.path.join(path, p)
+ if not os.access(crashpath, os.R_OK):
+ log.warning('unable to read crash path %s' % (crashpath))
+ continue
+ metapath = os.path.join(crashpath, 'meta')
+ donepath = os.path.join(crashpath, 'done')
+ if os.path.isfile(metapath):
+ if not os.path.isfile(donepath):
+ # hang out just for a bit; either we interrupted the dump
+ # or the daemon crashed before finishing it
+ time.sleep(1)
+ if not os.path.isfile(donepath):
+ return
+ # ok, we can process this one
+ rc = post_crash(crashpath)
+ if rc == 0:
+ os.rename(crashpath, os.path.join(path, 'posted/', p))
+ log.debug(
+ "posted %s and renamed %s -> %s " %
+ (metapath, p, os.path.join('posted/', p))
+ )
+
+
+def handler(signum, frame):
+ print('*** Interrupted with signal %d ***' % signum)
+ sys.exit(0)
+
+
+def drop_privs():
+ if os.getuid() == 0:
+ try:
+ ceph_uid = pwd.getpwnam("ceph").pw_uid
+ ceph_gid = grp.getgrnam("ceph").gr_gid
+ os.setgroups([])
+ os.setgid(ceph_gid)
+ os.setuid(ceph_uid)
+ except Exception as e:
+ log.error(f"Unable to drop privileges: {e}")
+ sys.exit(1)
+
+
+def main():
+ global auth_names
+
+ # run as unprivileged ceph user
+ drop_privs()
+
+ # exit code 0 on SIGINT, SIGTERM
+ signal.signal(signal.SIGINT, handler)
+ signal.signal(signal.SIGTERM, handler)
+
+ args = parse_args()
+ if args.log_level == 'DEBUG':
+ log.setLevel(logging.DEBUG)
+
+ postdir = os.path.join(args.path, 'posted')
+ if args.name:
+ auth_names = [args.name]
+
+ while not os.path.isdir(postdir):
+ log.error("directory %s does not exist; please create" % postdir)
+ time.sleep(30)
+
+ log.info("pinging cluster to exercise our key")
+ pr = subprocess.Popen(args=['timeout', '30', 'ceph', '-s'])
+ pr.wait()
+
+ log.info("monitoring path %s, delay %ds" % (args.path, args.delay * 60.0))
+ while True:
+ try:
+ scrape_path(args.path)
+ except Exception as e:
+ log.error(f"Error scraping {args.path}: {e}")
+ if args.delay == 0:
+ sys.exit(0)
+ time.sleep(args.delay * 60)
+
+
+if __name__ == "__main__":
+ main()