1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
-- SPDX-License-Identifier: GPL-3.0-or-later
local ffi = require('ffi')
-- Declare the C functions this file calls through ffi.C.
ffi.cdef([[
int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
int sd_notify(int unset_environment, const char *state);
void abort(void);
]])
-- Module table; it is what this file returns.
local watchdog = {}
-- Internal state and callbacks (interval, qname, qtype, event, ok_callback,
-- fail_callback, check_answer_callback); also handed out by watchdog.config().
local private = {}
--- Report a successful watchdog check.
-- Sends the 'WATCHDOG=1' state string through sd_notify(), passing 0 as
-- unset_environment. The C return value is discarded.
local function sd_signal_ok()
	local keep_environment = 0
	ffi.C.sd_notify(keep_environment, 'WATCHDOG=1')
end
--- Failure handler: write a log line, then abort() the process.
-- Kept on `private` so that it is reachable through watchdog.config().
private.fail_callback = function ()
	log('[watchdog] ABORTING resolver, supervisor is expected to restart it')
	ffi.C.abort()
end
-- logging: helpers that collect the watchdog query's trace log and inspect its answer
--- Build a callback that hooks a request's trace log into logbuf.
-- @param logbuf table; each trace message is appended to it as a Lua string
-- @return function(req) that installs a 'trace_log_f' C callback as
--   req.trace_log (the 'trace_log_f' type is declared outside this file)
local function add_tracer(logbuf)
	local function install_tracer(req)
		-- first argument of the C callback is not used here
		local append_message = function (_, msg)
			jit.off(true, true) -- JIT for (C -> lua)^2 nesting isn't allowed
			logbuf[#logbuf + 1] = ffi.string(msg)
		end
		req.trace_log = ffi.cast('trace_log_f', append_message)
	end
	return install_tracer
end
--- Build the completion callback for one watchdog query.
-- @param logbuf array of strings holding the query's trace log; it is
--   concatenated and logged only when the check fails
-- @return function(pkt, req) which frees req.trace_log, calls
--   private.ok_callback() when the answer's rcode is NOERROR or NXDOMAIN,
--   and otherwise logs the details and calls private.fail_callback()
local function check_answer(logbuf)
	return function (pkt, req)
		-- release the C callback stored in req.trace_log
		req.trace_log:free()
		if pkt ~= nil then
			local rcode = pkt:rcode()
			if rcode == kres.rcode.NOERROR or rcode == kres.rcode.NXDOMAIN then
				private.ok_callback()
				return
			end
		end
		log('[watchdog] watchdog query returned unexpected answer! query verbose log:')
		-- log() is used printf-style throughout this file, so the collected
		-- text must be passed as an argument: used as the format string, any
		-- '%' inside it would be interpreted as a format directive and could
		-- raise before fail_callback() is reached.
		log('%s', table.concat(logbuf, ''))
		if pkt ~= nil then
			log('[watchdog] problematic answer:\n%s', pkt)
		else
			log('[watchdog] answer was dropped')
		end
		-- failure! quit immediately to allow process supervisor to restart us
		private.fail_callback()
	end
end
-- stored on `private` so timer() picks the callback factory up from there
private.check_answer_callback = check_answer
--- Periodic check; watchdog.config() registers it as a recurrent event.
-- When both private.qname and private.qtype are set, a traced query is
-- started and its outcome is judged by private.check_answer_callback.
-- Otherwise private.ok_callback() is called right away.
local function timer()
	local logbuf = {}
	local qname, qtype = private.qname, private.qtype
	if not (qname and qtype) then
		-- no watchdog query configured
		private.ok_callback()
		return
	end
	-- fire watchdog query
	if verbose() then
		log('[watchdog] starting watchdog query %s %s', qname, qtype)
	end
	local qclass = kres.class.IN
	local options = {'TRACE'}
	local answer_cb = private.check_answer_callback(logbuf)
	local tracer_cb = add_tracer(logbuf)
	resolve(qname, qtype, qclass, options, answer_cb, tracer_cb)
end
--- Read or change the watchdog configuration.
-- @param cfg optional table with fields `interval` (in ms), `qname` and
--   `qtype`; when cfg is nil or false nothing is changed
-- @return the `private` state table
-- Raises an error when the resulting interval is not a number >= 1.
function watchdog.config(cfg)
	if not cfg then
		-- read only
		return private
	end
	-- precedence: explicit value, previously configured value, 10000
	local period = cfg.interval or private.interval or 10000
	period = tonumber(period)
	if period == nil or period < 1 then
		error('[watchdog] interval must be >= 1 ms')
	end
	private.interval = period
	-- qname = nil will disable DNS queries
	private.qname = cfg.qname
	local qtype = cfg.qtype
	if not qtype then
		qtype = kres.type.A
	end
	private.qtype = qtype
	-- restart timers
	watchdog.deinit()
	private.event = event.recurrent(private.interval, timer)
	return private
end
--- Module initialisation: automatically enable the watchdog if it is
-- configured in systemd.
-- Calls sd_watchdog_enabled(). If that call cannot be made (pcall fails,
-- reported as "systemd library not detected") or it returns 0, nothing is
-- started. If it returns a positive value, the timer is configured to run at
-- half of the reported timeout.
-- Raises an error when a timer is already registered or when
-- sd_watchdog_enabled() returns a negative value.
function watchdog.init()
	if private.event then
		error('[watchdog] module is already loaded')
	end
	local timeoutptr = ffi.new('uint64_t[1]')
	-- pcall guards the ffi.C lookup/call itself, not just its return value
	local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
	if not systemd_present then
		if verbose() then
			log('[watchdog] systemd library not detected')
		end
		return
	end
	private.ok_callback = sd_signal_ok
	if ret < 0 then
		-- error() takes (message, level) and does no formatting, so the
		-- message has to be built with string.format first.
		-- NOTE(review): knot_strerror is not declared in this file's cdef;
		-- presumably it is declared elsewhere - confirm.
		error(string.format('[watchdog] %s',
			ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
	elseif ret == 0 then
		if verbose() then
			log('[watchdog] disabled in systemd (WatchdogSec= not specified)')
		end
		return
	end
	local timeout = tonumber(timeoutptr[0]) / 1000 -- convert to ms
	local interval = timeout / 2 -- halve interval to make sure we are never late
	if interval < 1 then
		-- only logged here; watchdog.config() below rejects such an interval
		-- by raising an error
		log('[watchdog] error: WatchdogSec= must be at least 2ms! (got %d usec)',
			tonumber(timeoutptr[0]))
	end
	watchdog.config({ interval = interval })
	if verbose() then
		log('[watchdog] systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
			private.interval, timeout)
	end
end
--- Stop the watchdog timer if one is registered; otherwise do nothing.
function watchdog.deinit()
	local running = private.event
	if not running then
		return
	end
	event.cancel(running)
	private.event = nil
end
-- export the module table (config, init, deinit)
return watchdog
|