summaryrefslogtreecommitdiffstats
path: root/modules/watchdog/watchdog.lua
blob: 6d50be22cb68e3bfade7bcf2594a0687f8b26fec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
-- SPDX-License-Identifier: GPL-3.0-or-later
local ffi = require('ffi')

-- Declare the C functions this module calls through ffi.C:
-- sd_watchdog_enabled() and sd_notify() (used to talk to systemd, see
-- watchdog.init and sd_signal_ok) and abort() (used by private.fail_callback).
ffi.cdef([[
	int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
	int sd_notify(int unset_environment, const char *state);
	void abort(void);
]])

-- Module table; it is what this file returns at its end.
local watchdog = {}
-- Internal state and callbacks (interval, qname, qtype, event, ok_callback,
-- fail_callback, check_answer_callback); watchdog.config() returns this table.
local private = {}

-- Success handler: send the 'WATCHDOG=1' state string through sd_notify().
-- The return value of sd_notify() is not inspected.
local sd_signal_ok = function ()
	local state = 'WATCHDOG=1'
	ffi.C.sd_notify(0, state)
end

-- Failure handler: log that the resolver is being aborted, then call abort().
-- Restarting the process is left to whatever supervises it.
private.fail_callback = function ()
	local reason = 'ABORTING resolver, supervisor is expected to restart it'
	log_error(ffi.C.LOG_GRP_WATCHDOG, reason)
	ffi.C.abort()
end

-- logging
-- Build the function that timer() hands to resolve() as its last argument.
-- @param logbuf  array-like table; each traced message is appended to it
-- @return function(req) that stores a 'trace_log_f' C callback in req.trace_log
--         (check_answer's callback releases it again with :free())
local function add_tracer(logbuf)
	local function install(req)
		local collect = function (_, text)
			jit.off(true, true) -- JIT for (C -> lua)^2 nesting isn't allowed
			logbuf[#logbuf + 1] = ffi.string(text)
		end
		req.trace_log = ffi.cast('trace_log_f', collect)
	end
	return install
end

-- Build the completion callback for one watchdog query.
-- @param logbuf  table of trace messages collected while the query ran
-- @return function(pkt, req) which first releases the trace callback stored in
--         req.trace_log and then:
--         * calls private.ok_callback() when pkt is present and its rcode is
--           NOERROR or NXDOMAIN;
--         * otherwise logs the collected trace and the answer (or the fact
--           that it was dropped) and calls private.fail_callback().
local function check_answer(logbuf)
	return function (pkt, req)
		req.trace_log:free()
		if pkt ~= nil and (pkt:rcode() == kres.rcode.NOERROR
							or pkt:rcode() == kres.rcode.NXDOMAIN) then
			private.ok_callback()
			return
		end
		log_info(ffi.C.LOG_GRP_WATCHDOG, 'watchdog query returned unexpected answer! query log:')
		-- The trace is arbitrary text: pass it as an argument to '%s' rather
		-- than as the format string, so a '%' inside it cannot be taken for
		-- a format directive.
		log_info(ffi.C.LOG_GRP_WATCHDOG, '%s', table.concat(logbuf, ''))
		if pkt ~= nil then
			log_info(ffi.C.LOG_GRP_WATCHDOG, 'problematic answer:\n%s', pkt)
		else
			log_info(ffi.C.LOG_GRP_WATCHDOG, 'answer was dropped')
		end
		-- failure! quit immediately to allow process supervisor to restart us
		private.fail_callback()
	end
end
-- Exposed through the private table, so it is reachable via watchdog.config().
private.check_answer_callback = check_answer

-- Recurrent event handler installed by watchdog.config().
-- Without a configured qname/qtype it reports success right away; otherwise
-- it starts a traced query whose completion callback decides between
-- private.ok_callback() and private.fail_callback().
local function timer()
	local name, rrtype = private.qname, private.qtype
	if not (name and rrtype) then
		-- no DNS check configured
		private.ok_callback()
		return
	end

	-- fire watchdog query; the trace buffer is shared by both callbacks
	local trace = {}
	log_info(ffi.C.LOG_GRP_WATCHDOG, 'starting watchdog query %s %s', name, rrtype)
	local on_done = private.check_answer_callback(trace)
	local on_init = add_tracer(trace)
	resolve(name, rrtype, kres.class.IN, {'TRACE'}, on_done, on_init)
end

-- Read or change the watchdog configuration.
-- @param cfg  nil/false to only read the current state; otherwise a table with
--             optional fields: interval (ms, falls back to the previous
--             interval, then to 10000), qname (nil disables DNS queries) and
--             qtype (defaults to kres.type.A)
-- @return the private state table
-- Raises an error when the interval is not a number >= 1.
function watchdog.config(cfg)
	if cfg then
		local requested = cfg.interval or private.interval or 10000
		local period = tonumber(requested)
		if period == nil or period < 1 then
			error('[watchdog] interval must be >= 1 ms')
		end
		private.interval = period

		-- qname = nil will disable DNS queries
		private.qname = cfg.qname
		private.qtype = cfg.qtype or kres.type.A

		-- restart timers: cancel the old event (if any), schedule a new one
		watchdog.deinit()
		private.event = event.recurrent(period, timer)
	end
	return private
end

-- Automatically enable the watchdog if it is configured in systemd.
-- Asks sd_watchdog_enabled() for the watchdog timeout (in usec) and, when one
-- is set, schedules the periodic check at half of that timeout.
-- Raises an error when the module is already loaded, when
-- sd_watchdog_enabled() reports a failure, or (through watchdog.config) when
-- the resulting interval is below 1 ms.
function watchdog.init()
	if private.event then
		error('[watchdog] module is already loaded')
	end
	local timeoutptr = ffi.new('uint64_t[1]')
	-- a failing call is treated as "systemd library not available"
	local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
	if not systemd_present then
		log_info(ffi.C.LOG_GRP_WATCHDOG, 'systemd library not detected')
		return
	end
	private.ok_callback = sd_signal_ok
	if ret < 0 then
		-- error() takes (message, level) and does no formatting itself,
		-- so the message has to be built before it is raised
		error(string.format('[watchdog] %s', ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
	elseif ret == 0 then
		log_info(ffi.C.LOG_GRP_WATCHDOG, 'disabled in systemd (WatchdogSec= not specified)')
		return
	end
	local timeout = tonumber(timeoutptr[0]) / 1000  -- convert to ms
	local interval = timeout / 2  -- halve interval to make sure we are never late
	if interval < 1 then
		-- only logged here; watchdog.config() below rejects the interval
		log_error(ffi.C.LOG_GRP_WATCHDOG, 'error: WatchdogSec= must be at least 2ms! (got %d usec)',
			tonumber(timeoutptr[0]))
	end
	watchdog.config({ interval = interval })
	log_info(ffi.C.LOG_GRP_WATCHDOG, 'systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
		private.interval, timeout)
end

-- Stop the periodic check: cancel the recurrent event, if one is scheduled,
-- and forget its handle. Does nothing when no event is active.
function watchdog.deinit()
	local running = private.event
	if not running then
		return
	end
	event.cancel(running)
	private.event = nil
end

return watchdog