diff options
Diffstat (limited to 'modules/watchdog')
-rw-r--r-- | modules/watchdog/.packaging/test.config | 4 | ||||
-rw-r--r-- | modules/watchdog/README.rst | 43 | ||||
-rw-r--r-- | modules/watchdog/watchdog.lua | 137 |
3 files changed, 184 insertions, 0 deletions
diff --git a/modules/watchdog/.packaging/test.config b/modules/watchdog/.packaging/test.config new file mode 100644 index 0000000..9d1a291 --- /dev/null +++ b/modules/watchdog/.packaging/test.config @@ -0,0 +1,4 @@ +-- SPDX-License-Identifier: GPL-3.0-or-later +modules.load('watchdog') +assert(watchdog) +quit() diff --git a/modules/watchdog/README.rst b/modules/watchdog/README.rst new file mode 100644 index 0000000..514f6c0 --- /dev/null +++ b/modules/watchdog/README.rst @@ -0,0 +1,43 @@ +.. SPDX-License-Identifier: GPL-3.0-or-later + +.. _mod-watchdog: + +Watchdog +======== + +This module cooperates with the Systemd watchdog to restart the process in case +the internal event loop gets stuck. The upstream Systemd unit files are configured +to use this feature, which is turned on with the ``WatchdogSec=`` directive +in the service file. + +As an optional feature, this module can also do an internal DNS query to check if the resolver +answers correctly. To use this feature you must configure the DNS name and type to query for: + +.. code-block:: lua + + watchdog.config({ qname = 'nic.cz.', qtype = kres.type.A }) + +Every single query from the watchdog must result in an answer with +RCODE = NOERROR or NXDOMAIN. Any other result will terminate the resolver +(with SIGABRT) to allow the supervisor process to do cleanup, gather a coredump +and restart the resolver. + +It is recommended to use a name with a very short TTL to make sure the watchdog +is testing all parts of the resolver and not only its cache. Obviously this check +makes sense only when used with very reliable domains; otherwise a failure +on the authoritative side will shut down the resolver! + +`WatchdogSec` specifies the deadline after which the supervisor will kill the process. +Watchdog queries are executed every `WatchdogSec / 2` seconds. +This implies that **half** of the `WatchdogSec` interval must be long enough for +a normal DNS query to succeed, so do not forget to add two or three seconds +for random network timeouts etc. 
+
+The module is loaded by default. If you'd like to disable it you can unload it:
+
+.. code-block:: lua
+
+	modules.unload('watchdog')
+
+Beware that unloading the module without disabling the watchdog feature in the
+supervisor will lead to an infinite restart loop.
diff --git a/modules/watchdog/watchdog.lua b/modules/watchdog/watchdog.lua
new file mode 100644
index 0000000..710b8a8
--- /dev/null
+++ b/modules/watchdog/watchdog.lua
@@ -0,0 +1,163 @@
+-- SPDX-License-Identifier: GPL-3.0-or-later
+local ffi = require('ffi')
+
+ffi.cdef([[
+	int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
+	int sd_notify(int unset_environment, const char *state);
+	void abort(void);
+]])
+
+local watchdog = {}
+local private = {}
+
+-- Tell systemd that the process is alive (sends 'WATCHDOG=1' via sd_notify).
+local function sd_signal_ok()
+	ffi.C.sd_notify(0, 'WATCHDOG=1')
+end
+
+-- Default success handler: a no-op. watchdog.init() replaces it with
+-- sd_signal_ok once the systemd call succeeds; without this default the
+-- timer would call a nil value when watchdog.config() is used after init()
+-- returned early.
+function private.ok_callback()
+end
+
+-- Failure handler: log the reason and abort() the process.
+function private.fail_callback()
+	log('[watchdog] ABORTING resolver, supervisor is expected to restart it')
+	ffi.C.abort()
+end
+
+-- logging
+-- Returns a function that installs a trace_log callback on the request;
+-- the callback appends every trace message to logbuf.
+local function add_tracer(logbuf)
+	return function (req)
+		local function qrylogger(_, msg)
+			jit.off(true, true) -- JIT for (C -> lua)^2 nesting isn't allowed
+			table.insert(logbuf, ffi.string(msg))
+		end
+		req.trace_log = ffi.cast('trace_log_f', qrylogger)
+	end
+end
+
+-- Returns the completion callback for a watchdog query.
+-- RCODE NOERROR or NXDOMAIN counts as success; any other outcome (a nil
+-- packet included) logs the collected trace and calls private.fail_callback().
+local function check_answer(logbuf)
+	return function (pkt, req)
+		req.trace_log:free()
+		if pkt ~= nil and (pkt:rcode() == kres.rcode.NOERROR
+			or pkt:rcode() == kres.rcode.NXDOMAIN) then
+			private.ok_callback()
+			return
+		end
+		log('[watchdog] watchdog query returned unexpected answer! query verbose log:')
+		-- the trace is data, not a format string (log() is used printf-style here)
+		log('%s', table.concat(logbuf, ''))
+		if pkt ~= nil then
+			log('[watchdog] problematic answer:\n%s', pkt)
+		else
+			log('[watchdog] answer was dropped')
+		end
+		-- failure! quit immediately to allow process supervisor to restart us
+		private.fail_callback()
+	end
+end
+private.check_answer_callback = check_answer
+
+-- Periodic check: run the configured watchdog query, or report success
+-- right away when no query is configured.
+local function timer()
+	local logbuf = {}
+	-- fire watchdog query
+	if private.qname and private.qtype then
+		if verbose() then
+			log('[watchdog] starting watchdog query %s %s', private.qname, private.qtype)
+		end
+		resolve(private.qname,
+			private.qtype,
+			kres.class.IN,
+			{'TRACE'},
+			private.check_answer_callback(logbuf),
+			add_tracer(logbuf))
+	else
+		private.ok_callback()
+	end
+end
+
+-- Read or change the configuration.
+-- cfg: nil to only read, or a table with optional fields `interval` (ms),
+-- `qname` and `qtype`. Returns the private state table.
+function watchdog.config(cfg)
+	-- read only
+	if not cfg then
+		return private
+	end
+
+	local interval = tonumber(cfg.interval or private.interval or 10000)
+	if not interval or interval < 1 then
+		error('[watchdog] interval must be >= 1 ms')
+	end
+	private.interval = interval
+
+	-- qname = nil will disable DNS queries
+	private.qname = cfg.qname
+	private.qtype = cfg.qtype or kres.type.A
+
+	-- restart timers
+	watchdog.deinit()
+	private.event = event.recurrent(private.interval, timer)
+	return private
+end
+
+-- automatically enable watchdog if it is configured in systemd
+function watchdog.init()
+	if private.event then
+		error('[watchdog] module is already loaded')
+	end
+	local timeoutptr = ffi.new('uint64_t[1]')
+	-- a failing call is taken to mean that the systemd library is missing
+	local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
+	if not systemd_present then
+		if verbose() then
+			log('[watchdog] systemd library not detected')
+		end
+		return
+	end
+	private.ok_callback = sd_signal_ok
+	if ret < 0 then
+		-- error() does not format: its second argument is the stack level,
+		-- so the message must be built with string.format() first.
+		-- NOTE(review): knot_strerror is not declared in the cdef above;
+		-- presumably it is declared elsewhere - confirm.
+		error(string.format('[watchdog] %s',
+			ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
+	elseif ret == 0 then
+		if verbose() then
+			log('[watchdog] disabled in systemd (WatchdogSec= not specified)')
+		end
+		return
+	end
+	local timeout = tonumber(timeoutptr[0]) / 1000 -- convert to ms
+	local interval = timeout / 2 -- halve interval to make sure we are never late
+	if interval < 1 then
+		log('[watchdog] error: WatchdogSec= must be at least 2ms! (got %d usec)',
+			tonumber(timeoutptr[0]))
+	end
+	watchdog.config({ interval = interval })
+	if verbose() then
+		log('[watchdog] systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
+			private.interval, timeout)
+	end
+end
+
+-- Stop the periodic timer, if one is running.
+function watchdog.deinit()
+	if private.event then
+		event.cancel(private.event)
+		private.event = nil
+	end
+end
+
+return watchdog