summaryrefslogtreecommitdiffstats
path: root/modules/http/prometheus.lua
diff options
context:
space:
mode:
Diffstat (limited to 'modules/http/prometheus.lua')
-rw-r--r--modules/http/prometheus.lua178
1 files changed, 178 insertions, 0 deletions
diff --git a/modules/http/prometheus.lua b/modules/http/prometheus.lua
new file mode 100644
index 0000000..3218552
--- /dev/null
+++ b/modules/http/prometheus.lua
@@ -0,0 +1,178 @@
+-- SPDX-License-Identifier: GPL-3.0-or-later
+-- Module implementation
+local M = {
+ namespace = '',
+ finalize = function (_ --[[metrics]]) end,
+}
+
+-- Gauge metrics
+local gauges = {
+ ['worker.concurrent'] = true,
+ ['worker.rss'] = true,
+}
+
+local function merge(t, results, prefix)
+ for _, result in pairs(results) do
+ if type(result) == 'table' then
+ for k, v in pairs(result) do
+ local val = t[prefix..k]
+ t[prefix..k] = (val or 0) + v
+ end
+ end
+ end
+end
+
+local function getstats()
+ local t = {}
+ merge(t, map 'stats.list()', '')
+ merge(t, map 'cache.stats()', 'cache.')
+ merge(t, map 'worker.stats()', 'worker.')
+ return t
+end
+
+-- @returns current stats + difference against previous data set passed in @param prev
+local function snapshot_start(prev)
+ assert(type(prev) == 'table', 'table with previous values expected')
+ local is_empty = true
+ -- Get current snapshot
+ local cur, stats_dt = getstats(), {}
+ for k,v in pairs(cur) do
+ if gauges[k] then
+ stats_dt[k] = v
+ else
+ stats_dt[k] = v - (prev[k] or 0)
+ end
+ is_empty = is_empty and stats_dt[k] == 0
+ end
+ -- Calculate upstreams and geotag them if possible
+ local upstreams
+ if http.geoip then
+ upstreams = stats.upstreams()
+ for k,v in pairs(upstreams) do
+ local gi
+ if string.find(k, '.', 1, true) then
+ gi = http.geoip:search_ipv4(k)
+ else
+ gi = http.geoip:search_ipv6(k)
+ end
+ if gi then
+ upstreams[k] = {data=v, location=gi.location, country=gi.country and gi.country.iso_code}
+ end
+ end
+ end
+ -- Aggregate per-worker metrics
+ local wdata = {}
+ for _, info in pairs(map 'worker.info()') do
+ if type(info) == 'table' then
+ wdata[tostring(info.pid)] = {
+ rss = info.rss,
+ usertime = info.usertime,
+ systime = info.systime,
+ pagefaults = info.pagefaults,
+ queries = info.queries
+ }
+ end
+ end
+ -- Publish stats updates periodically
+ if not is_empty then
+ local update = {time=os.time(), stats=stats_dt, upstreams=upstreams, workers=wdata}
+ return cur, update
+ end
+ return cur, nil
+end
+
+-- Function to sort frequency list
+local function stream_stats(_, ws)
+ local ok = true
+ -- Publish stats updates periodically
+ local prev = getstats()
+ while ok do
+ worker.sleep(1)
+ local update
+ prev, update = snapshot_start(prev)
+ local push = tojson(update)
+ ok = ws:send(push)
+ end
+end
+
+-- Transform metrics from Graphite to Prometheus format
+-- See: https://gitlab.nic.cz/knot/knot-resolver/-/issues/650
+-- E.g.:
+-- worker.ipv4 -> worker_ipv4
+-- answer.blocked;stype=A -> answer_blocked{stype="A"}
+local function get_metric(key)
+ local key_index, key_len, key_tag = 0, #key, 0
+ return select(1, key:gsub('.', function (c)
+ key_index = key_index + 1
+ if key_tag == 0 then
+ if c == '.' then return '_' end
+ if c == ';' then key_tag = 1; return '{' end
+ elseif key_tag == 1 then
+ if key_index == key_len then
+ if c == '=' then return '=""}'
+ else return c .. '"}' end
+ end
+ if c == '=' then key_tag = 2; return '="' end
+ elseif key_tag == 2 then
+ if key_index == key_len then
+ if c == ';' then return '"}'
+ else return c .. '"}' end
+ end
+ if c == ';' then key_tag = 1; return '",' end
+ end
+ return nil
+ end))
+end
+
+-- Render stats in Prometheus text format
+local function serve_prometheus()
+ -- First aggregate metrics list and print counters
+ local slist, render = getstats(), {}
+ local latency = {}
+ local counter = '# TYPE %s counter\n%s %f'
+ for k,v in pairs(slist) do
+ k = get_metric(k)
+ -- Aggregate histograms
+ local band = k:match('answer_([%d]+)ms')
+ if band then
+ table.insert(latency, {band, v})
+ elseif k == 'answer_slow' then
+ table.insert(latency, {'+Inf', v})
+ -- Counter as a fallback
+ else
+ local key = M.namespace .. k
+ local name, label = key:match('^([^{]+)(.*)$')
+ table.insert(render, string.format(counter, name, name .. label, v))
+ end
+ end
+ -- Fill in latency histogram
+ local function kweight(x) return tonumber(x) or math.huge end
+ table.sort(latency, function (a,b) return kweight(a[1]) < kweight(b[1]) end)
+ table.insert(render, string.format('# TYPE %slatency histogram', M.namespace))
+ local count, sum = 0.0, 0.0
+ for _,e in ipairs(latency) do
+ -- The information about the %Inf bin is lost, so we treat it
+ -- as a timeout (3000ms) for metrics purposes
+ count = count + e[2]
+ sum = sum + e[2] * (math.min(tonumber(e[1]), 3000.0))
+ table.insert(render, string.format('%slatency_bucket{le="%s"} %f', M.namespace, e[1], count))
+ end
+ table.insert(render, string.format('%slatency_count %f', M.namespace, count))
+ table.insert(render, string.format('%slatency_sum %f', M.namespace, sum))
+ -- Finalize metrics table before rendering
+ if type(M.finalize) == 'function' then
+ M.finalize(render)
+ end
+ return table.concat(render, '\n') .. '\n'
+end
+
+-- Export module interface
+M.endpoints = {
+ ['/stats'] = {'application/json', getstats, stream_stats},
+ ['/frequent'] = {'application/json', function () return stats.frequent() end},
+ ['/upstreams'] = {'application/json', function () return stats.upstreams() end},
+ ['/bogus'] = {'application/json', function () return bogus_log.frequent() end},
+ ['/metrics'] = {'text/plain; version=0.0.4', serve_prometheus},
+}
+
+return M