From c69cb8cc094cc916adbc516b09e944cd3d137c01 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 13:08:07 +0200 Subject: Adding upstream version 1.29.3. Signed-off-by: Daniel Baumann --- health/health.d/dbengine.conf | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 health/health.d/dbengine.conf (limited to 'health/health.d/dbengine.conf') diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf new file mode 100644 index 0000000..274673e --- /dev/null +++ b/health/health.d/dbengine.conf @@ -0,0 +1,50 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of fs_errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + to: sysadmin + + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of io_errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) + to: sysadmin + + alarm: 10min_dbengine_global_flushing_warnings + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of pg_cache_over_half_dirty_events + units: errors + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks + to: sysadmin + + alarm: 10min_dbengine_global_flushing_errors + on: netdata.dbengine_long_term_page_stats + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of flushing_pressure_deletions + units: pages + every: 10s + crit: $this != 0 + delay: down 1h multiplier 1.5 max 3h + info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks + to: sysadmin -- cgit v1.2.3