author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:41:58 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:41:58 +0000
commit | 1852910ef0fd7393da62b88aee66ee092208748e (patch)
tree | ad3b659dbbe622b58a5bda4fe0b5e1d80eee9277 /modules/http/prometheus.lua
parent | Initial commit. (diff)
download | knot-resolver-upstream.tar.xz, knot-resolver-upstream.zip
Adding upstream version 5.3.1. (upstream/5.3.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'modules/http/prometheus.lua')
-rw-r--r-- | modules/http/prometheus.lua | 178
1 file changed, 178 insertions, 0 deletions
diff --git a/modules/http/prometheus.lua b/modules/http/prometheus.lua
new file mode 100644
index 0000000..3218552
--- /dev/null
+++ b/modules/http/prometheus.lua
@@ -0,0 +1,178 @@
+-- SPDX-License-Identifier: GPL-3.0-or-later
+-- Module implementation
+local M = {
+    namespace = '',
+    finalize = function (_ --[[metrics]]) end,
+}
+
+-- Gauge metrics
+local gauges = {
+    ['worker.concurrent'] = true,
+    ['worker.rss'] = true,
+}
+
+local function merge(t, results, prefix)
+    for _, result in pairs(results) do
+        if type(result) == 'table' then
+            for k, v in pairs(result) do
+                local val = t[prefix..k]
+                t[prefix..k] = (val or 0) + v
+            end
+        end
+    end
+end
+
+local function getstats()
+    local t = {}
+    merge(t, map 'stats.list()', '')
+    merge(t, map 'cache.stats()', 'cache.')
+    merge(t, map 'worker.stats()', 'worker.')
+    return t
+end
+
+-- @returns current stats + difference against previous data set passed in @param prev
+local function snapshot_start(prev)
+    assert(type(prev) == 'table', 'table with previous values expected')
+    local is_empty = true
+    -- Get current snapshot
+    local cur, stats_dt = getstats(), {}
+    for k,v in pairs(cur) do
+        if gauges[k] then
+            stats_dt[k] = v
+        else
+            stats_dt[k] = v - (prev[k] or 0)
+        end
+        is_empty = is_empty and stats_dt[k] == 0
+    end
+    -- Calculate upstreams and geotag them if possible
+    local upstreams
+    if http.geoip then
+        upstreams = stats.upstreams()
+        for k,v in pairs(upstreams) do
+            local gi
+            if string.find(k, '.', 1, true) then
+                gi = http.geoip:search_ipv4(k)
+            else
+                gi = http.geoip:search_ipv6(k)
+            end
+            if gi then
+                upstreams[k] = {data=v, location=gi.location, country=gi.country and gi.country.iso_code}
+            end
+        end
+    end
+    -- Aggregate per-worker metrics
+    local wdata = {}
+    for _, info in pairs(map 'worker.info()') do
+        if type(info) == 'table' then
+            wdata[tostring(info.pid)] = {
+                rss = info.rss,
+                usertime = info.usertime,
+                systime = info.systime,
+                pagefaults = info.pagefaults,
+                queries = info.queries
+            }
+        end
+    end
+    -- Publish stats updates periodically
+    if not is_empty then
+        local update = {time=os.time(), stats=stats_dt, upstreams=upstreams, workers=wdata}
+        return cur, update
+    end
+    return cur, nil
+end
+
+-- Function to sort frequency list
+local function stream_stats(_, ws)
+    local ok = true
+    -- Publish stats updates periodically
+    local prev = getstats()
+    while ok do
+        worker.sleep(1)
+        local update
+        prev, update = snapshot_start(prev)
+        local push = tojson(update)
+        ok = ws:send(push)
+    end
+end
+
+-- Transform metrics from Graphite to Prometheus format
+-- See: https://gitlab.nic.cz/knot/knot-resolver/-/issues/650
+-- E.g.:
+--   worker.ipv4 -> worker_ipv4
+--   answer.blocked;stype=A -> answer_blocked{stype="A"}
+local function get_metric(key)
+    local key_index, key_len, key_tag = 0, #key, 0
+    return select(1, key:gsub('.', function (c)
+        key_index = key_index + 1
+        if key_tag == 0 then
+            if c == '.' then return '_' end
+            if c == ';' then key_tag = 1; return '{' end
+        elseif key_tag == 1 then
+            if key_index == key_len then
+                if c == '=' then return '=""}'
+                else return c .. '"}' end
+            end
+            if c == '=' then key_tag = 2; return '="' end
+        elseif key_tag == 2 then
+            if key_index == key_len then
+                if c == ';' then return '"}'
+                else return c .. '"}' end
+            end
+            if c == ';' then key_tag = 1; return '",' end
+        end
+        return nil
+    end))
+end
+
+-- Render stats in Prometheus text format
+local function serve_prometheus()
+    -- First aggregate metrics list and print counters
+    local slist, render = getstats(), {}
+    local latency = {}
+    local counter = '# TYPE %s counter\n%s %f'
+    for k,v in pairs(slist) do
+        k = get_metric(k)
+        -- Aggregate histograms
+        local band = k:match('answer_([%d]+)ms')
+        if band then
+            table.insert(latency, {band, v})
+        elseif k == 'answer_slow' then
+            table.insert(latency, {'+Inf', v})
+        -- Counter as a fallback
+        else
+            local key = M.namespace .. k
+            local name, label = key:match('^([^{]+)(.*)$')
+            table.insert(render, string.format(counter, name, name .. label, v))
+        end
+    end
+    -- Fill in latency histogram
+    local function kweight(x) return tonumber(x) or math.huge end
+    table.sort(latency, function (a,b) return kweight(a[1]) < kweight(b[1]) end)
+    table.insert(render, string.format('# TYPE %slatency histogram', M.namespace))
+    local count, sum = 0.0, 0.0
+    for _,e in ipairs(latency) do
+        -- The information about the %Inf bin is lost, so we treat it
+        -- as a timeout (3000ms) for metrics purposes
+        count = count + e[2]
+        sum = sum + e[2] * (math.min(tonumber(e[1]), 3000.0))
+        table.insert(render, string.format('%slatency_bucket{le="%s"} %f', M.namespace, e[1], count))
+    end
+    table.insert(render, string.format('%slatency_count %f', M.namespace, count))
+    table.insert(render, string.format('%slatency_sum %f', M.namespace, sum))
+    -- Finalize metrics table before rendering
+    if type(M.finalize) == 'function' then
+        M.finalize(render)
+    end
+    return table.concat(render, '\n') .. '\n'
+end
+
+-- Export module interface
+M.endpoints = {
+    ['/stats'] = {'application/json', getstats, stream_stats},
+    ['/frequent'] = {'application/json', function () return stats.frequent() end},
+    ['/upstreams'] = {'application/json', function () return stats.upstreams() end},
+    ['/bogus'] = {'application/json', function () return bogus_log.frequent() end},
+    ['/metrics'] = {'text/plain; version=0.0.4', serve_prometheus},
+}
+
+return M