summaryrefslogtreecommitdiffstats
path: root/src/util/watchdog.c
blob: 3ec1fbc5f0b4bf631ff420567414c973bdcbb7a6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
/*++
/* NAME
/*	watchdog 3
/* SUMMARY
/*	watchdog timer
/* SYNOPSIS
/*	#include <watchdog.h>
/*
/*	WATCHDOG *watchdog_create(timeout, action, context)
/*	unsigned timeout;
/*	void	(*action)(WATCHDOG *watchdog, char *context);
/*	char	*context;
/*
/*	void	watchdog_start(watchdog)
/*	WATCHDOG *watchdog;
/*
/*	void	watchdog_stop(watchdog)
/*	WATCHDOG *watchdog;
/*
/*	void	watchdog_destroy(watchdog)
/*	WATCHDOG *watchdog;
/*
/*	void	watchdog_pat()
/* DESCRIPTION
/*	This module implements watchdog timers that are based on ugly
/*	UNIX alarm timers. The module is designed to survive systems
/*	with clocks that jump occasionally.
/*
/*	Watchdog timers can be stacked. Only one watchdog timer can be
/*	active at a time. Only the last created watchdog timer can be
/*	manipulated. Watchdog timers must be destroyed in reverse order
/*	of creation.
/*
/*	watchdog_create() suspends the current watchdog timer, if any,
/*	and instantiates a new watchdog timer.
/*
/*	watchdog_start() starts or restarts the watchdog timer.
/*
/*	watchdog_stop() stops the watchdog timer.
/*
/*	watchdog_destroy() stops the watchdog timer, and resumes the
/*	watchdog timer instance that was suspended by watchdog_create().
/*
/*	watchdog_pat() pats the watchdog, so it stays quiet.
/*
/*	Arguments:
/* .IP timeout
/*	The watchdog time limit. When the watchdog timer runs, the
/*	process must invoke watchdog_start(), watchdog_pat(),
/*	watchdog_stop() or watchdog_destroy() before the time limit
/*	is reached.
/* .IP action
/*	A null pointer, or pointer to function that is called when the
/*	watchdog alarm goes off. The default action is to terminate
/*	the process with a fatal error.
/* .IP context
/*	Application context that is passed to the action routine.
/* .IP watchdog
/*	Must be a pointer to the most recently created watchdog instance.
/*	This argument is checked upon each call.
/* BUGS
/*	UNIX alarm timers are not stackable, so there can be at most one
/*	watchdog instance active at any given time.
/* SEE ALSO
/*	msg(3) diagnostics interface
/* DIAGNOSTICS
/*	Fatal errors: memory allocation problem, system call failure.
/*	Panics: interface violations.
/* LICENSE
/* .ad
/* .fi
/*	The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/*	Wietse Venema
/*	IBM T.J. Watson Research
/*	P.O. Box 704
/*	Yorktown Heights, NY 10598, USA
/*
/*	Wietse Venema
/*	Google, Inc.
/*	111 8th Avenue
/*	New York, NY 10011, USA
/*--*/

/* System library. */

#include <sys_defs.h>
#include <unistd.h>
#include <signal.h>
#include <posix_signals.h>

/* Utility library. */

#include <msg.h>
#include <mymalloc.h>
#include <killme_after.h>
#include <watchdog.h>

/* Application-specific. */

 /*
  * Rather than having one timer that goes off when it is too late, we break
  * up the time limit into smaller intervals so that we can deal with clocks
  * that jump occasionally. watchdog_create() divides the caller's time limit
  * by this number to get the alarm interval, and watchdog_event() invokes
  * the action only after this many successive alarms without a reset.
  */
#define WATCHDOG_STEPS	3

 /*
  * UNIX alarms are not stackable, but we can save and restore state, so that
  * watchdogs can at least be nested, sort of.
  */
struct WATCHDOG {
    unsigned timeout;			/* alarm interval: limit / WATCHDOG_STEPS */
    WATCHDOG_FN action;			/* application routine, or null */
    char   *context;			/* passed to the action routine */
    int     trip_run;			/* successive alarms since last reset */
    WATCHDOG *saved_watchdog;		/* instance that was current at create */
    struct sigaction saved_action;	/* SIGALRM disposition at create time */
    unsigned saved_time;		/* alarm(0) result at create time */
};

 /*
  * However, only one watchdog instance can be current, and the caller has to
  * restore state before a prior watchdog instance can be manipulated. This
  * points to the most recently created instance that was not yet destroyed,
  * or is null when there is none. The SIGALRM handler operates on it.
  */
static WATCHDOG *watchdog_curr;

 /*
  * Workaround for systems where the alarm signal does not wakeup the event
  * machinery, and therefore does not restart the watchdog timer in the
  * single_server etc. skeletons. The symptom is that programs abort when the
  * watchdog timeout is less than the max_idle time.
  */
#ifdef USE_WATCHDOG_PIPE
#include <errno.h>
#include <iostuff.h>
#include <events.h>

static int watchdog_pipe[2];

/* watchdog_read - drain the wakeup pipe */

static void watchdog_read(int unused_event, void *unused_context)
{
    char    sink;

    /*
     * Discard one byte at a time until read() reports no more data, end of
     * file, or an error. The bytes carry no information; only the wakeup
     * itself matters.
     */
    for (;;) {
	if (read(watchdog_pipe[0], &sink, 1) <= 0)
	    break;
    }
}

#endif					/* USE_WATCHDOG_PIPE */

/* watchdog_event - SIGALRM handler: count a step, re-arm or trip */

static void watchdog_event(int unused_sig)
{
    static const char myname[] = "watchdog_event";
    WATCHDOG *dog = watchdog_curr;

    /*
     * This code executes in signal handler context, so it must stay away
     * from anything that allocates or releases memory. On the other hand,
     * terminating without saying why is not acceptable, which is why msg(3)
     * was made safe for signal handlers that terminate the process.
     */
    if (dog == 0)
	msg_panic("%s: no instance", myname);
    if (msg_verbose > 1)
	msg_info("%s: %p %d", myname, (void *) dog, dog->trip_run);

    /*
     * Not enough successive expirations yet: schedule the next step.
     */
    dog->trip_run += 1;
    if (dog->trip_run < WATCHDOG_STEPS) {
#ifdef USE_WATCHDOG_PIPE
	int     saved_errno = errno;

	/* Wake up the events(3) engine. */
	if (write(watchdog_pipe[1], "", 1) != 1)
	    msg_warn("%s: write watchdog_pipe: %m", myname);
	errno = saved_errno;
#endif
	alarm(dog->timeout);
	return;
    }

    /*
     * The time limit was reached. Hand control to the application routine
     * when there is one.
     */
    if (dog->action) {
	dog->action(dog, dog->context);
	return;
    }

    /*
     * Default action: terminate with a fatal error, after arranging for a
     * forced exit in case the orderly one gets stuck.
     */
    killme_after(5);
#ifdef TEST
    pause();
#endif
    msg_fatal("watchdog timeout");
}

/* watchdog_create - create watchdog instance */

WATCHDOG *watchdog_create(unsigned timeout, WATCHDOG_FN action, char *context)
{
    const char *myname = "watchdog_create";
    struct sigaction sig_action;
    WATCHDOG *wp;

    /*
     * The alarm interval is the caller's limit divided by WATCHDOG_STEPS. A
     * limit that rounds down to zero is rejected, because alarm(0) cancels
     * the timer instead of starting it. The limit is unsigned, so report it
     * with %u.
     */
    wp = (WATCHDOG *) mymalloc(sizeof(*wp));
    if ((wp->timeout = timeout / WATCHDOG_STEPS) == 0)
	msg_panic("%s: timeout %u is too small", myname, timeout);
    wp->action = action;
    wp->context = context;

    /*
     * mymalloc() is not known to clear memory. Give the counter a defined
     * value so that it is never read while indeterminate.
     */
    wp->trip_run = 0;

    /*
     * Suspend whatever was active: remember the current instance, cancel
     * the pending alarm while saving its remaining time, and replace the
     * SIGALRM disposition while saving the old one. watchdog_destroy()
     * restores all three.
     */
    wp->saved_watchdog = watchdog_curr;
    wp->saved_time = alarm(0);
    sigemptyset(&sig_action.sa_mask);
#ifdef SA_RESTART
    sig_action.sa_flags = SA_RESTART;
#else
    sig_action.sa_flags = 0;
#endif
    sig_action.sa_handler = watchdog_event;
    if (sigaction(SIGALRM, &sig_action, &wp->saved_action) < 0)
	msg_fatal("%s: sigaction(SIGALRM): %m", myname);
    if (msg_verbose > 1)
	msg_info("%s: %p %u", myname, (void *) wp, timeout);
#ifdef USE_WATCHDOG_PIPE

    /*
     * The wakeup pipe is shared by all nested instances, so it is set up
     * only when the outermost instance is created.
     */
    if (watchdog_curr == 0) {
	if (pipe(watchdog_pipe) < 0)
	    msg_fatal("%s: pipe: %m", myname);
	non_blocking(watchdog_pipe[0], NON_BLOCKING);
	non_blocking(watchdog_pipe[1], NON_BLOCKING);
	close_on_exec(watchdog_pipe[0], CLOSE_ON_EXEC);	/* Fix 20190126 */
	close_on_exec(watchdog_pipe[1], CLOSE_ON_EXEC);	/* Fix 20190126 */
	event_enable_read(watchdog_pipe[0], watchdog_read, (void *) 0);
    }
#endif
    return (watchdog_curr = wp);
}

/* watchdog_destroy - destroy watchdog instance, restore state */

void    watchdog_destroy(WATCHDOG *wp)
{
    const char *myname = "watchdog_destroy";

    /*
     * When no watchdog exists, a null pointer equals watchdog_curr and
     * would pass the instance check in watchdog_stop(). Reject it here,
     * before it is dereferenced.
     */
    if (wp == 0)
	msg_panic("%s: null watchdog instance", myname);

    /*
     * Stop the timer (this also verifies that wp is the current instance),
     * then put back what watchdog_create() saved: the previous instance,
     * the previous SIGALRM disposition, and the alarm that was pending.
     */
    watchdog_stop(wp);
    watchdog_curr = wp->saved_watchdog;
    if (sigaction(SIGALRM, &wp->saved_action, (struct sigaction *) 0) < 0)
	msg_fatal("%s: sigaction(SIGALRM): %m", myname);
    if (wp->saved_time)
	alarm(wp->saved_time);
#ifdef USE_WATCHDOG_PIPE
    /* The outermost instance is gone: tear down the shared wakeup pipe. */
    if (watchdog_curr == 0) {
	event_disable_readwrite(watchdog_pipe[0]);
	(void) close(watchdog_pipe[0]);
	(void) close(watchdog_pipe[1]);
    }
#endif
    if (msg_verbose > 1)
	msg_info("%s: %p", myname, (void *) wp);

    /*
     * Release the instance last. A pointer value becomes indeterminate once
     * its storage is freed, so it must not be used, not even for logging,
     * after this point.
     */
    myfree((void *) wp);
}

/* watchdog_start - enable watchdog timer */

void    watchdog_start(WATCHDOG *wp)
{
    const char *myname = "watchdog_start";

    /*
     * Only the current instance may be manipulated. When no watchdog
     * exists, a null pointer equals watchdog_curr, so it is rejected
     * explicitly before it is dereferenced below.
     */
    if (wp == 0 || wp != watchdog_curr)
	msg_panic("%s: wrong watchdog instance", myname);

    /* Start counting from scratch and arm the first alarm interval. */
    wp->trip_run = 0;
    alarm(wp->timeout);
    if (msg_verbose > 1)
	msg_info("%s: %p", myname, (void *) wp);
}

/* watchdog_stop - cancel the pending watchdog alarm */

void    watchdog_stop(WATCHDOG *wp)
{
    static const char myname[] = "watchdog_stop";

    /* Only the current instance may be manipulated. */
    if (watchdog_curr != wp)
	msg_panic("%s: wrong watchdog instance", myname);

    /* A zero argument cancels the pending alarm, if there is one. */
    (void) alarm(0);

    if (msg_verbose > 1)
	msg_info("%s: %p", myname, (void *) wp);
}

/* watchdog_pat - reset the expiration count of the current watchdog */

void    watchdog_pat(void)
{
    static const char myname[] = "watchdog_pat";

    /* Harmless when no watchdog exists: there is nothing to reset. */
    if (watchdog_curr != 0)
	watchdog_curr->trip_run = 0;

    if (msg_verbose <= 1)
	return;
    msg_info("%s: %p", myname, (void *) watchdog_curr);
}

#ifdef TEST

#include <vstream.h>

/* main - test driver: pat the watchdog once per input character */

int     main(int unused_argc, char **unused_argv)
{
    WATCHDOG *dog;

    msg_verbose = 2;

    /* No action routine, so a timeout takes the default fatal-error path. */
    dog = watchdog_create(10, (WATCHDOG_FN) 0, (void *) 0);
    watchdog_start(dog);

    /* Pat once up front, then again after every character that is read. */
    for (;;) {
	watchdog_pat();
	if (VSTREAM_GETCHAR() == VSTREAM_EOF)
	    break;
    }

    watchdog_destroy(dog);
    return (0);
}

#endif