178 lines
5.1 KiB
Lua
178 lines
5.1 KiB
Lua
-- SPDX-License-Identifier: GPL-3.0-or-later
|
|
-- Module implementation
local M = {}

-- Prefix prepended to every metric name rendered for the Prometheus endpoint.
M.namespace = ''

-- Hook called with the list of rendered metric lines just before they are
-- joined into the response body; does nothing by default.
M.finalize = function (_ --[[metrics]]) end
|
|
|
|
-- Gauge metrics: keys listed here are reported as-is in snapshots instead of
-- as a difference against the previous sample.
local gauges = {}
for _, gauge_name in ipairs({'worker.concurrent', 'worker.rss'}) do
	gauges[gauge_name] = true
end
|
|
|
|
-- Sum several result tables into one accumulator table.
-- @param t       destination table, updated in place
-- @param results collection of results; entries that are not tables are skipped
-- @param prefix  string prepended to every key before it is stored in t
local function merge(t, results, prefix)
	-- Add one result table into the accumulator, ignoring non-table entries
	local function accumulate(result)
		if type(result) ~= 'table' then return end
		for name, amount in pairs(result) do
			local key = prefix .. name
			t[key] = (t[key] or 0) + amount
		end
	end

	for _, result in pairs(results) do
		accumulate(result)
	end
end
|
|
|
|
-- Collect one flat table of statistics by evaluating each source expression
-- through map() and summing the results with merge().
-- @returns table keyed by metric name (with 'cache.' / 'worker.' prefixes
--          for the respective sources)
local function getstats()
	-- {expression passed to map, key prefix}, evaluated in this order
	local sources = {
		{'stats.list()', ''},
		{'cache.stats()', 'cache.'},
		{'worker.stats()', 'worker.'},
	}
	local totals = {}
	for _, source in ipairs(sources) do
		merge(totals, map(source[1]), source[2])
	end
	return totals
end
|
|
|
|
-- Take a fresh stats snapshot and describe what changed since the last one.
-- @param prev table with the previous snapshot's values (asserted to be a table)
-- @returns current stats + difference against previous data set passed in prev;
--          the second value is nil when every computed value is zero
local function snapshot_start(prev)
	assert(type(prev) == 'table', 'table with previous values expected')

	-- Get current snapshot; gauges are copied, everything else is a delta
	local cur = getstats()
	local stats_dt = {}
	local changed = false
	for name, value in pairs(cur) do
		local delta
		if gauges[name] then
			delta = value
		else
			delta = value - (prev[name] or 0)
		end
		stats_dt[name] = delta
		if delta ~= 0 then
			changed = true
		end
	end

	-- Calculate upstreams and geotag them if possible
	local upstreams
	if http.geoip then
		upstreams = stats.upstreams()
		for addr, data in pairs(upstreams) do
			-- A literal dot in the key selects the IPv4 lookup, otherwise IPv6
			local has_dot = string.find(addr, '.', 1, true)
			local gi
			if has_dot then
				gi = http.geoip:search_ipv4(addr)
			else
				gi = http.geoip:search_ipv6(addr)
			end
			if gi then
				upstreams[addr] = {
					data = data,
					location = gi.location,
					country = gi.country and gi.country.iso_code,
				}
			end
		end
	end

	-- Aggregate per-worker metrics, keyed by the stringified pid
	local wdata = {}
	for _, info in pairs(map 'worker.info()') do
		if type(info) == 'table' then
			local entry = {}
			entry.rss = info.rss
			entry.usertime = info.usertime
			entry.systime = info.systime
			entry.pagefaults = info.pagefaults
			entry.queries = info.queries
			wdata[tostring(info.pid)] = entry
		end
	end

	-- Only build an update record when at least one value is non-zero
	local update = nil
	if changed then
		update = {time=os.time(), stats=stats_dt, upstreams=upstreams, workers=wdata}
	end
	return cur, update
end
|
|
|
|
-- Stream stats updates over a websocket.
-- Once per worker.sleep(1) interval a new snapshot is taken, serialized with
-- tojson() and sent; the loop ends when ws:send() returns a falsy value.
-- @param _  unused
-- @param ws object providing a send() method
local function stream_stats(_, ws)
	local prev = getstats()
	local alive
	repeat
		worker.sleep(1)
		local update
		prev, update = snapshot_start(prev)
		alive = ws:send(tojson(update))
	until not alive
end
|
|
|
|
-- Transform metrics from Graphite to Prometheus format
-- See: https://gitlab.nic.cz/knot/knot-resolver/-/issues/650
-- E.g.:
--   worker.ipv4 -> worker_ipv4
--   answer.blocked;stype=A -> answer_blocked{stype="A"}
--
-- @param key metric name in Graphite notation (string)
-- @returns the converted name as a single string (the substitution count
--          produced by gsub is deliberately not returned)
local function get_metric(key)
	-- Parser states: reading the metric name, a tag name, or a tag value
	local NAME, TAG_KEY, TAG_VALUE = 0, 1, 2
	local state = NAME
	local pos, last = 0, #key

	-- Called for every character; returning nil keeps the character unchanged
	local function translate(c)
		pos = pos + 1
		local at_end = (pos == last)
		if state == NAME then
			if c == '.' then return '_' end
			if c == ';' then
				state = TAG_KEY
				return '{'
			end
		elseif state == TAG_KEY then
			if at_end then
				-- Last character: close the label set, adding an empty value
				-- when the key ends right after '='
				if c == '=' then return '=""}' end
				return c .. '"}'
			end
			if c == '=' then
				state = TAG_VALUE
				return '="'
			end
		elseif state == TAG_VALUE then
			if at_end then
				-- Last character: close the quoted value and the label set
				if c == ';' then return '"}' end
				return c .. '"}'
			end
			if c == ';' then
				state = TAG_KEY
				return '",'
			end
		end
		return nil
	end

	-- Assigning to a single local drops gsub's second result; select(1, ...)
	-- would have passed it through to the caller as well.
	local converted = key:gsub('.', translate)
	return converted
end
|
|
|
|
-- Render stats in Prometheus text format
-- Keys matching answer_<N>ms and answer_slow are folded into one cumulative
-- latency histogram; every other key is rendered as a counter.
-- NOTE(review): keys listed in `gauges` (e.g. worker.rss) are also announced
-- with '# TYPE ... counter' here - confirm whether that is intended.
-- @returns string with one rendered entry per line and a trailing newline
local function serve_prometheus()
	-- First aggregate metrics list and print counters
	local slist, render = getstats(), {}
	local latency = {}
	local counter = '# TYPE %s counter\n%s %f'
	for k,v in pairs(slist) do
		k = get_metric(k)
		-- Aggregate histograms
		local band = k:match('answer_([%d]+)ms')
		if band then
			table.insert(latency, {band, v})
		elseif k == 'answer_slow' then
			table.insert(latency, {'+Inf', v})
		-- Counter as a fallback
		else
			local key = M.namespace .. k
			-- Split 'name{labels}' so the TYPE line carries the bare name only
			local name, label = key:match('^([^{]+)(.*)$')
			table.insert(render, string.format(counter, name, name .. label, v))
		end
	end
	-- Fill in latency histogram
	-- Numeric weight of a bucket bound; anything tonumber() cannot parse
	-- (such as '+Inf' on some Lua versions) counts as infinity
	local function kweight(x) return tonumber(x) or math.huge end
	table.sort(latency, function (a,b) return kweight(a[1]) < kweight(b[1]) end)
	table.insert(render, string.format('# TYPE %slatency histogram', M.namespace))
	local count, sum = 0.0, 0.0
	for _,e in ipairs(latency) do
		-- The information about the +Inf bin is lost, so we treat it
		-- as a timeout (3000ms) for metrics purposes
		count = count + e[2]
		-- Use kweight() rather than bare tonumber() so a nil parse result
		-- for '+Inf' never reaches math.min()
		sum = sum + e[2] * math.min(kweight(e[1]), 3000.0)
		table.insert(render, string.format('%slatency_bucket{le="%s"} %f', M.namespace, e[1], count))
	end
	table.insert(render, string.format('%slatency_count %f', M.namespace, count))
	table.insert(render, string.format('%slatency_sum %f', M.namespace, sum))
	-- Finalize metrics table before rendering
	if type(M.finalize) == 'function' then
		M.finalize(render)
	end
	return table.concat(render, '\n') .. '\n'
end
|
|
|
|
-- Export module interface
-- Each endpoint maps a URL path to {MIME type, handler[, streaming handler]}.
do
	local json = 'application/json'

	local function frequent() return stats.frequent() end
	local function upstreams() return stats.upstreams() end
	local function bogus() return bogus_log.frequent() end

	M.endpoints = {
		['/stats'] = {json, getstats, stream_stats},
		['/frequent'] = {json, frequent},
		['/upstreams'] = {json, upstreams},
		['/bogus'] = {json, bogus},
		['/metrics'] = {'text/plain; version=0.0.4', serve_prometheus},
	}
end

return M
|