summaryrefslogtreecommitdiffstats
path: root/health/health.d/dbengine.conf
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--health/health.d/dbengine.conf50
1 files changed, 50 insertions, 0 deletions
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
new file mode 100644
index 0000000..274673e
--- /dev/null
+++ b/health/health.d/dbengine.conf
@@ -0,0 +1,50 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: 10min_dbengine_global_fs_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of fs_errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_io_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of io_errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_warnings
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
+ units: errors
+ every: 10s
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_errors
+ on: netdata.dbengine_long_term_page_stats
+ os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of flushing_pressure_deletions
+ units: pages
+ every: 10s
+ crit: $this != 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks
+ to: sysadmin