summaryrefslogtreecommitdiffstats
path: root/modules/watchdog/watchdog.lua
blob: 710b8a8c4cfd34d72c1ba91a0db2f09d4be00e53 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
-- SPDX-License-Identifier: GPL-3.0-or-later
local ffi = require('ffi')

-- C prototypes needed by this module: the two systemd notification calls
-- (resolved lazily through ffi.C, see the pcall in watchdog.init) and abort()
-- which is used to terminate the process when the watchdog check fails.
ffi.cdef([[
	int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
	int sd_notify(int unset_environment, const char *state);
	void abort(void);
]])

-- Public module table returned at the end of the file.
local watchdog = {}
-- Internal state and overridable callbacks (interval, qname, qtype, event,
-- ok_callback, fail_callback, check_answer_callback); also returned by
-- watchdog.config() so it can be inspected from outside.
local private = {}

-- Send the watchdog keep-alive notification via sd_notify().
-- The result of sd_notify() is intentionally not inspected.
local function sd_signal_ok()
	local keepalive = 'WATCHDOG=1'
	ffi.C.sd_notify(0, keepalive)
end

-- Failure handler: log a notice and terminate the process via abort().
-- Stored in `private` so that it can be replaced from outside.
private.fail_callback = function ()
	local notice = '[watchdog] ABORTING resolver, supervisor is expected to restart it'
	log(notice)
	ffi.C.abort()
end

-- logging
-- Build an init callback for a request which installs a trace logger that
-- appends every trace message (converted to a Lua string) to `logbuf`.
-- @param logbuf array that receives the trace messages
-- @return function(req) that sets req.trace_log to a new C callback
local function add_tracer(logbuf)
	local function install(req)
		local function collect(_, text)
			jit.off(true, true) -- JIT for (C -> lua)^2 nesting isn't allowed
			logbuf[#logbuf + 1] = ffi.string(text)
		end
		local c_callback = ffi.cast('trace_log_f', collect)
		req.trace_log = c_callback
	end
	return install
end

-- Build the completion callback for a single watchdog query.
-- @param logbuf array of trace messages collected for this query
-- @return function(pkt, req): calls private.ok_callback() when the answer
--         has RCODE NOERROR or NXDOMAIN; otherwise logs the collected trace
--         and the answer (if any) and calls private.fail_callback()
local function check_answer(logbuf)
	return function (pkt, req)
		-- the trace callback attached to the request is not needed anymore
		req.trace_log:free()
		if pkt ~= nil then
			local rcode = pkt:rcode()
			if rcode == kres.rcode.NOERROR or rcode == kres.rcode.NXDOMAIN then
				private.ok_callback()
				return
			end
		end
		log('[watchdog] watchdog query returned unexpected answer! query verbose log:')
		-- log() is used with printf-style formats in this file, so the trace
		-- must be passed as an argument: a '%' inside the collected messages
		-- would otherwise be interpreted as a format directive
		log('%s', table.concat(logbuf, ''))
		if pkt ~= nil then
			log('[watchdog] problematic answer:\n%s', pkt)
		else
			log('[watchdog] answer was dropped')
		end
		-- failure! quit immediately to allow process supervisor to restart us
		private.fail_callback()
	end
end
-- exported through `private` so the callback factory can be replaced
private.check_answer_callback = check_answer

-- Periodic watchdog tick.
-- When both a query name and type are configured, a traced query is started
-- and its result is judged by private.check_answer_callback; otherwise
-- private.ok_callback() is called right away.
local function timer()
	local trace = {}
	local qname, qtype = private.qname, private.qtype
	if not (qname and qtype) then
		-- no watchdog query configured, just report that we are alive
		private.ok_callback()
		return
	end
	-- fire watchdog query
	if verbose() then
		log('[watchdog] starting watchdog query %s %s', qname, qtype)
	end
	resolve(qname, qtype, kres.class.IN, {'TRACE'},
		private.check_answer_callback(trace),
		add_tracer(trace))
end

-- Read or change the watchdog configuration.
-- @param cfg optional table with fields `interval` (ms), `qname`, `qtype`;
--        when omitted the current state is returned without any change
-- @return the internal `private` state table
-- Raises an error if the resulting interval is not a number >= 1.
function watchdog.config(cfg)
	if cfg then
		-- explicit value, then previously configured value, then 10000 ms
		local requested = cfg.interval or private.interval or 10000
		local period = tonumber(requested)
		if period == nil or period < 1 then
			error('[watchdog] interval must be >= 1 ms')
		end
		private.interval = period

		-- qname = nil will disable DNS queries
		private.qname = cfg.qname
		private.qtype = cfg.qtype or kres.type.A

		-- restart timers: cancel the old event (if any), schedule a new one
		watchdog.deinit()
		private.event = event.recurrent(private.interval, timer)
	end
	return private
end

-- automatically enable watchdog if it is configured in systemd
-- Queries sd_watchdog_enabled() and, when a watchdog timeout is set, starts
-- the periodic check with half of that timeout as the interval.
-- Returns without starting anything when the systemd function cannot be
-- called or when sd_watchdog_enabled() reports 0.
-- Raises an error if the module is already running or if
-- sd_watchdog_enabled() returns a negative value.
function watchdog.init()
	if private.event then
		error('[watchdog] module is already loaded')
	end
	local timeoutptr = ffi.new('uint64_t[1]')
	-- pcall: accessing the symbol through ffi.C fails if it is unavailable
	local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
	if not systemd_present then
		if verbose() then
			log('[watchdog] systemd library not detected')
		end
		return
	end
	private.ok_callback = sd_signal_ok
	if ret < 0 then
		-- error() does not format its arguments (the second parameter is
		-- the error level), so the message has to be built beforehand
		error(string.format('[watchdog] %s',
			ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
	elseif ret == 0 then
		if verbose() then
			log('[watchdog] disabled in systemd (WatchdogSec= not specified)')
		end
		return
	end
	local timeout = tonumber(timeoutptr[0]) / 1000  -- convert to ms
	local interval = timeout / 2  -- halve interval to make sure we are never late
	if interval < 1 then
		-- only logged here; watchdog.config() below rejects the interval
		log('[watchdog] error: WatchdogSec= must be at least 2ms! (got %d usec)',
			tonumber(timeoutptr[0]))
	end
	watchdog.config({ interval = interval })
	if verbose() then
		log('[watchdog] systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
			private.interval, timeout)
	end
end

-- Stop the periodic watchdog check, if one is scheduled.
-- Safe to call when no event is active.
function watchdog.deinit()
	local pending = private.event
	if not pending then
		return
	end
	event.cancel(pending)
	private.event = nil
end

-- Module table (config, init, deinit) handed back to whoever loads this file.
return watchdog