1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
-- SPDX-License-Identifier: GPL-3.0-or-later
local ffi = require('ffi')
-- C prototypes for the functions this module calls through ffi.C:
-- sd_watchdog_enabled() is queried in watchdog.init(), sd_notify() is used by
-- the OK callback and abort() by the failure callback.
-- NOTE(review): the sd_* symbols are presumably provided by libsystemd;
-- watchdog.init() wraps the first call in pcall to cope with them being absent.
ffi.cdef([[
int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
int sd_notify(int unset_environment, const char *state);
void abort(void);
]])
-- Public module table (returned at the end of the file).
local watchdog = {}
-- Internal state and replaceable callbacks (interval, qname, qtype, event,
-- ok_callback, fail_callback, check_answer_callback); watchdog.config()
-- returns this table to the caller.
local private = {}
-- Success callback: send the 'WATCHDOG=1' state string through sd_notify().
-- The first argument (unset_environment) is 0; the return code of sd_notify()
-- is intentionally not inspected.
local function sd_signal_ok()
	local unset_environment = 0
	ffi.C.sd_notify(unset_environment, 'WATCHDOG=1')
end
-- Failure callback: log an error and terminate the process via abort().
-- Stored in `private` so that it can be replaced through the table returned
-- by watchdog.config().
private.fail_callback = function ()
	local message = 'ABORTING resolver, supervisor is expected to restart it'
	log_error(ffi.C.LOG_GRP_WATCHDOG, message)
	ffi.C.abort()
end
-- logging
-- Build a request-initialisation hook that captures trace messages.
-- @param logbuf  array table; every trace message is appended to it as a Lua string
-- @return function(req) that installs a 'trace_log_f' C callback into req.trace_log
local function add_tracer(logbuf)
	local function install_tracer(req)
		req.trace_log = ffi.cast('trace_log_f', function (_, msg)
			jit.off(true, true) -- JIT for (C -> lua)^2 nesting isn't allowed
			logbuf[#logbuf + 1] = ffi.string(msg)
		end)
	end
	return install_tracer
end
-- Build the completion callback for one watchdog query.
-- @param logbuf  array of trace messages collected for the query (see add_tracer)
-- @return function(pkt, req) which:
--   * frees the callback object stored in req.trace_log,
--   * calls private.ok_callback() when the answer exists and its RCODE is
--     NOERROR or NXDOMAIN,
--   * otherwise logs the collected trace and the answer (or the fact that it
--     was dropped) and calls private.fail_callback().
local function check_answer(logbuf)
	return function (pkt, req)
		-- presumably the callback installed by add_tracer(); release it first
		req.trace_log:free()
		if pkt ~= nil then
			local rcode = pkt:rcode()
			if rcode == kres.rcode.NOERROR or rcode == kres.rcode.NXDOMAIN then
				private.ok_callback()
				return
			end
		end
		log_info(ffi.C.LOG_GRP_WATCHDOG, 'watchdog query returned unexpected answer! query log:')
		-- log_info is used printf-style in this file, so the trace text must be
		-- passed as an argument rather than as the format: a '%' inside the
		-- trace could otherwise garble the output or raise before we get to
		-- fail_callback().
		log_info(ffi.C.LOG_GRP_WATCHDOG, '%s', table.concat(logbuf, ''))
		if pkt ~= nil then
			log_info(ffi.C.LOG_GRP_WATCHDOG, 'problematic answer:\n%s', pkt)
		else
			log_info(ffi.C.LOG_GRP_WATCHDOG, 'answer was dropped')
		end
		-- failure! quit immediately to allow process supervisor to restart us
		private.fail_callback()
	end
end
-- exported through `private` so the factory can be replaced from outside
private.check_answer_callback = check_answer
-- Periodic tick scheduled by watchdog.config().
-- With both qname and qtype configured it starts a traced watchdog query whose
-- result is judged by private.check_answer_callback; otherwise it reports
-- success right away through private.ok_callback().
local function timer()
	local qname, qtype = private.qname, private.qtype
	if not (qname and qtype) then
		-- DNS check disabled: just report that we are alive
		private.ok_callback()
		return
	end
	-- one shared buffer: the tracer fills it, the answer check prints it
	local logbuf = {}
	log_info(ffi.C.LOG_GRP_WATCHDOG, 'starting watchdog query %s %s', qname, qtype)
	resolve(
		qname,
		qtype,
		kres.class.IN,
		{'TRACE'},
		private.check_answer_callback(logbuf),
		add_tracer(logbuf)
	)
end
-- Read or change the watchdog configuration.
-- @param cfg  nil/false to only read; otherwise a table with optional fields
--             `interval` (ms, falls back to the current interval, then 10000),
--             `qname` (nil disables DNS queries) and `qtype` (default kres.type.A)
-- @return the internal `private` table
-- Raises an error when the interval is not a number or is below 1 ms.
-- Applying a configuration cancels the running timer and schedules a new one.
function watchdog.config(cfg)
	if cfg then
		local requested = cfg.interval or private.interval or 10000
		local interval = tonumber(requested)
		if interval == nil or interval < 1 then
			error('[watchdog] interval must be >= 1 ms')
		end
		private.interval = interval
		-- a missing qname turns the DNS check off
		private.qname = cfg.qname
		private.qtype = cfg.qtype or kres.type.A
		-- re-arm the recurrent timer with the new settings
		watchdog.deinit()
		private.event = event.recurrent(private.interval, timer)
	end
	return private
end
-- automatically enable watchdog if it is configured in systemd
-- Module initialisation.
-- Asks sd_watchdog_enabled() whether a watchdog timeout is configured:
--   * the call itself failing (pcall) is taken as "systemd library not
--     present" and the module stays idle,
--   * a negative result raises an error,
--   * zero means the watchdog is disabled and the module stays idle,
--   * otherwise the timeout (microseconds) is converted to milliseconds and
--     the check timer is started with half of that period.
-- Raises an error when a timer is already scheduled (module loaded twice).
function watchdog.init()
	if private.event then
		error('[watchdog] module is already loaded')
	end
	local timeoutptr = ffi.new('uint64_t[1]')
	local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
	if not systemd_present then
		log_info(ffi.C.LOG_GRP_WATCHDOG, 'systemd library not detected')
		return
	end
	private.ok_callback = sd_signal_ok
	if ret < 0 then
		-- error() does not format its arguments (the second one is the
		-- numeric stack level), so the message has to be built explicitly
		error(string.format('[watchdog] %s', ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
	elseif ret == 0 then
		log_info(ffi.C.LOG_GRP_WATCHDOG, 'disabled in systemd (WatchdogSec= not specified)')
		return
	end
	local timeout = tonumber(timeoutptr[0]) / 1000 -- convert to ms
	local interval = timeout / 2 -- halve interval to make sure we are never late
	if interval < 1 then
		-- only logged here; watchdog.config() below rejects the value with an error
		log_error(ffi.C.LOG_GRP_WATCHDOG, 'error: WatchdogSec= must be at least 2ms! (got %d usec)',
			tonumber(timeoutptr[0]))
	end
	watchdog.config({ interval = interval })
	log_info(ffi.C.LOG_GRP_WATCHDOG, 'systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
		private.interval, timeout)
end
-- Stop the periodic check: cancel the scheduled timer, if any, and forget its
-- handle. Safe to call when no timer is scheduled.
function watchdog.deinit()
	local handle = private.event
	if not handle then
		return
	end
	event.cancel(handle)
	private.event = nil
end
return watchdog
|