summaryrefslogtreecommitdiffstats
path: root/src/ceph-crash.in
blob: 0e02837fadd4dde8abd66985b485836402e10a37 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!@Python3_EXECUTABLE@
# -*- mode:python -*-
# vim: ts=4 sw=4 smarttab expandtab

import argparse
import grp
import logging
import os
import pwd
import signal
import socket
import subprocess
import sys
import time

# Configure root logging at INFO and log everything from this tool under
# the 'ceph-crash' logger name.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('ceph-crash')

# Ceph entity names tried, in this order, when posting a crash report:
# the host-specific crash client first, then the generic crash client,
# then the admin client.
auth_names = [
    'client.crash.%s' % socket.gethostname(),
    'client.crash',
    'client.admin',
]


def parse_args(argv=None):
    """Parse the command-line options of ceph-crash.

    :param argv: list of argument strings to parse; ``None`` (the default)
                 makes argparse read ``sys.argv[1:]``, which is what the
                 script entry point relies on.
    :returns: an ``argparse.Namespace`` with the attributes ``path`` (str),
              ``delay`` (float, minutes), ``name`` (str or None) and
              ``log_level`` (str or None).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--path', default='/var/lib/ceph/crash',
        help='base path to monitor for crash dumps')
    parser.add_argument(
        '-d', '--delay', default=10.0, type=float,
        help='minutes to delay between scans (0 to exit after one)',
    )
    # The default list of names lives in the module-level auth_names; the
    # help text mirrors it, including the host-specific name tried first.
    parser.add_argument(
        '--name', '-n',
        help='ceph name to authenticate as '
             '(default: try client.crash.<hostname>, client.crash, '
             'client.admin)')
    parser.add_argument(
        '--log-level', '-l',
        help='log level output (default: INFO), support INFO or DEBUG')

    return parser.parse_args(argv)


def post_crash(path):
    """Post the crash report stored in *path* to the cluster.

    The ``meta`` file inside *path* is piped on stdin to
    ``ceph -n <name> crash post -i -`` (wrapped in ``timeout 30``), trying
    each entry of the module-level ``auth_names`` in order and stopping at
    the first attempt that exits with status 0.

    :param path: crash directory that contains a ``meta`` file
    :returns: exit status of the last attempt made; 0 means the report was
              posted.  If ``auth_names`` is empty no attempt is made and 0
              is returned.
    :raises OSError: if ``meta`` cannot be read or the command cannot be
                     started
    """
    # Read the report once, before spawning anything: an unreadable meta
    # file then fails before a child process exists, the same bytes are
    # reused for every attempt, and the file handle is always closed.
    with open(os.path.join(path, 'meta'), 'rb') as f:
        meta = f.read()

    rc = 0
    for n in auth_names:
        pr = subprocess.Popen(
            args=['timeout', '30', 'ceph',
                  '-n', n,
                  'crash', 'post', '-i', '-'],
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # communicate() feeds stdin, drains stderr and waits for exit
        (_, stderr) = pr.communicate(input=meta)
        # stderr is only used for the log message below, so undecodable
        # bytes are replaced rather than allowed to raise
        stderr = stderr.decode(errors='replace')
        rc = pr.returncode
        if rc != 0 or stderr != "":
            log.warning('post %s as %s failed: %s' % (path, n, stderr))
        if rc == 0:
            break
    return rc


def scrape_path(path):
    """Post every completed crash dump found directly under *path*.

    An entry is considered a crash dump when it contains a ``meta`` file,
    and complete when it also contains a ``done`` file.  Each complete dump
    is handed to ``post_crash``; when that returns 0 the entry is moved
    into the ``posted`` subdirectory of *path*.

    Entries that are unreadable, have no ``meta`` file, or are still
    incomplete after a one-second grace period are skipped so that they do
    not prevent the remaining entries from being processed.

    :param path: base directory to scan
    :returns: None
    :raises OSError: if *path* cannot be listed or an entry cannot be moved
    """
    for p in os.listdir(path):
        crashpath = os.path.join(path, p)
        if not os.access(crashpath, os.R_OK):
            log.warning('unable to read crash path %s' % (crashpath))
            continue
        metapath = os.path.join(crashpath, 'meta')
        donepath = os.path.join(crashpath, 'done')
        if not os.path.isfile(metapath):
            continue
        if not os.path.isfile(donepath):
            # hang out just for a bit; either we interrupted the dump
            # or the daemon crashed before finishing it
            time.sleep(1)
            if not os.path.isfile(donepath):
                # Skip only this entry.  Returning here would abandon the
                # rest of the scan, so one dump that never completes would
                # block every entry listed after it.
                continue
        # ok, we can process this one
        rc = post_crash(crashpath)
        if rc == 0:
            os.rename(crashpath, os.path.join(path, 'posted/', p))
            log.debug(
                "posted %s and renamed %s -> %s " %
                (metapath, p, os.path.join('posted/', p))
            )

def handler(signum, frame):
    """Signal handler: announce the signal on stdout and exit with status 0.

    :param signum: number of the signal that was delivered
    :param frame: current stack frame (unused)
    :raises SystemExit: always, with exit code 0
    """
    message = '*** Interrupted with signal %d ***' % signum
    print(message)
    raise SystemExit(0)


def drop_privs():
    """Switch from root to the ``ceph`` user and group.

    Does nothing when the process is not running with uid 0.  Otherwise the
    supplementary groups are cleared and the gid and uid are set to those
    of the ``ceph`` group and user.  Any failure is logged and terminates
    the process with exit status 1.
    """
    if os.getuid() != 0:
        return

    try:
        uid = pwd.getpwnam("ceph").pw_uid
        gid = grp.getgrnam("ceph").gr_gid
        # groups and gid are changed before the uid
        os.setgroups([])
        os.setgid(gid)
        os.setuid(uid)
    except Exception as e:
        log.error(f"Unable to drop privileges: {e}")
        sys.exit(1)


def main():
    """Run the crash-report scraper.

    Drops root privileges, installs the SIGINT/SIGTERM handlers, parses the
    command line, waits until the ``posted`` directory exists under the
    monitored path, runs ``ceph -s`` once, and then scans the path
    repeatedly, sleeping ``--delay`` minutes between scans.  With a delay
    of 0 the process exits with status 0 after the first scan.
    """
    global auth_names

    # run as unprivileged ceph user
    drop_privs()

    # exit code 0 on SIGINT, SIGTERM
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, handler)

    args = parse_args()
    if args.log_level == 'DEBUG':
        log.setLevel(logging.DEBUG)

    # an explicit --name replaces the default list of names to try
    if args.name:
        auth_names = [args.name]

    # posted reports are moved here; keep waiting until it exists
    postdir = os.path.join(args.path, 'posted')
    while not os.path.isdir(postdir):
        log.error("directory %s does not exist; please create" % postdir)
        time.sleep(30)

    log.info("pinging cluster to exercise our key")
    subprocess.Popen(args=['timeout', '30', 'ceph', '-s']).wait()

    # --delay is given in minutes; sleep() and the log line use seconds
    scan_interval = args.delay * 60.0
    log.info("monitoring path %s, delay %ds" % (args.path, scan_interval))
    while True:
        try:
            scrape_path(args.path)
        except Exception as e:
            # a failed scan is logged and retried on the next pass
            log.error(f"Error scraping {args.path}: {e}")
        if args.delay == 0:
            sys.exit(0)
        time.sleep(scan_interval)


if __name__ == "__main__":
    main()