From b485aab7e71c1625cfc27e0f92c9509f42378458 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 5 May 2024 13:19:16 +0200 Subject: Adding upstream version 1.45.3+dfsg. Signed-off-by: Daniel Baumann --- src/health/health.d/ml.conf | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 src/health/health.d/ml.conf (limited to 'src/health/health.d/ml.conf') diff --git a/src/health/health.d/ml.conf b/src/health/health.d/ml.conf new file mode 100644 index 000000000..b6a5df6dd --- /dev/null +++ b/src/health/health.d/ml.conf @@ -0,0 +1,49 @@ +# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly +# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's +# native anomaly detection here: +# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal + +# some examples below are commented, you would need to uncomment and adjust as desired to enable them. + +# node level anomaly rate +# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate +# if node level anomaly rate is above 1% then warning (pick your own threshold that works best via trial and error). + template: ml_1min_node_ar + on: anomaly_detection.anomaly_rate + class: Workload + type: System +component: ML + lookup: average -1m of anomaly_rate + calc: $this + units: % + every: 30s + warn: $this > 1 + summary: ML node anomaly rate + info: Rolling 1min node level anomaly rate + to: silent + +# alert per dimension example +# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via tial and error). +# if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error). +# template: ml_5min_cpu_dims +# on: system.cpu +# lookup: average -5m anomaly-bit foreach * +# calc: $this +# units: % +# every: 30s +# warn: $this > (($status >= $WARNING) ? (5) : (20)) +# crit: $this > (($status == $CRITICAL) ? (20) : (100)) +# info: rolling 5min anomaly rate for each system.cpu dimension + +# alert per chart example +# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via tial and error). +# if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error). +# template: ml_5min_cpu_chart +# on: system.cpu +# lookup: average -5m anomaly-bit of * +# calc: $this +# units: % +# every: 30s +# warn: $this > (($status >= $WARNING) ? (5) : (20)) +# crit: $this > (($status == $CRITICAL) ? (20) : (100)) +# info: rolling 5min anomaly rate for system.cpu chart -- cgit v1.2.3