diff options
Diffstat (limited to 'health/health.d')
-rw-r--r-- | health/health.d/dbengine.conf | 44 | ||||
-rw-r--r-- | health/health.d/gearman.conf | 22 | ||||
-rw-r--r-- | health/health.d/hdfs.conf | 75 | ||||
-rw-r--r-- | health/health.d/mysql.conf | 34 | ||||
-rw-r--r-- | health/health.d/vcsa.conf | 122 | ||||
-rw-r--r-- | health/health.d/zookeeper.conf | 14 |
6 files changed, 289 insertions, 22 deletions
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index 956abf294..ce6427cd2 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -1,26 +1,26 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: 10min_dbengine_global_fs_errors - on: netdata.dbengine_global_errors - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of FS errors - units: errors - every: 10s - crit: $this > 0 - delay: down 15m multiplier 1.5 max 1h - info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) - to: sysadmin + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of FS errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + to: sysadmin - alarm: 10min_dbengine_global_io_errors - on: netdata.dbengine_global_errors - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of I/O errors - units: errors - every: 10s - crit: $this > 0 - delay: down 1h multiplier 1.5 max 3h - info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) - to: sysadmin + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of I/O errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) + to: sysadmin
\ No newline at end of file diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf new file mode 100644 index 000000000..e3863ae5e --- /dev/null +++ b/health/health.d/gearman.conf @@ -0,0 +1,22 @@ +# make sure Gearman is running +template: gearman_last_collected_secs + on: gearman.total_jobs + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +template: gearman_workers_queued + on: gearman.single_job + lookup: average -10m unaligned match-names of Queued + units: workers + every: 10s + warn: $this > 30000 + crit: $this > 100000 + delay: down 5m multiplier 1.5 max 1h + info: number of queued jobs + to: sysadmin
\ No newline at end of file diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf new file mode 100644 index 000000000..678faab4c --- /dev/null +++ b/health/health.d/hdfs.conf @@ -0,0 +1,75 @@ + +# make sure hdfs is running + +template: hdfs_last_collected_secs + on: hdfs.heap_memory + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + +# Common + +template: hdfs_capacity_usage + on: hdfs.capacity + calc: ($used) * 100 / ($used + $remaining) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used capacity + to: sysadmin + + +# NameNode + +template: hdfs_missing_blocks + on: hdfs.blocks + calc: $missing + units: missing blocks + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: missing blocks + to: sysadmin + + +template: hdfs_stale_nodes + on: hdfs.data_nodes + calc: $stale + units: dead nodes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: stale data nodes + to: sysadmin + + +template: hdfs_dead_nodes + on: hdfs.data_nodes + calc: $dead + units: dead nodes + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: dead data nodes + to: sysadmin + + +# DataNode + +template: hdfs_num_failed_volumes + on: hdfs.num_failed_volumes + calc: $fsds_num_failed_volumes + units: failed volumes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: failed volumes + to: sysadmin diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 39c401915..ce7b98a87 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -98,3 +98,37 @@ template: mysql_replication_lag info: the number of seconds mysql replication is behind this master to: dba + +# ----------------------------------------------------------------------------- +# galera cluster size + +template: mysql_galera_cluster_size_max_2m + on: mysql.galera_cluster_size + lookup: max -2m absolute + units: nodes + every: 10s + info: max cluster size 2 minute + to: dba + +template: mysql_galera_cluster_size + on: mysql.galera_cluster_size + calc: $nodes + units: nodes + every: 10s + warn: $this > $mysql_galera_cluster_size_max_2m + crit: $this < $mysql_galera_cluster_size_max_2m + delay: up 20s down 5m multiplier 1.5 max 1h + info: cluster size has changed + to: dba + +# galera node state + +template: mysql_galera_cluster_state + on: mysql.galera_cluster_state + calc: $state + every: 10s + warn: $this < 4 + crit: $this < 2 + delay: up 30s down 5m multiplier 1.5 max 1h + info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced) + to: dba diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf new file mode 100644 index 000000000..7bb98a9ba --- /dev/null +++ b/health/health.d/vcsa.conf @@ -0,0 +1,122 @@ + +# make sure vcsa is running and responding + +template: vcsa_last_collected_secs + on: vcsa.system_health + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# Overall system health: +# - 0: all components are healthy. +# - 1: one or more components might become overloaded soon. +# - 2: one or more components in the appliance might be degraded. +# - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon. +# - 4: no health data is available. + +template: vcsa_system_health + on: vcsa.system_health + lookup: max -10s unaligned of system + units: status + every: 10s + warn: ($this == 1) || ($this == 2) + crit: $this == 3 + delay: down 1m multiplier 1.5 max 1h + info: overall system health status + to: sysadmin + +# Components health: +# - 0: healthy. +# - 1: healthy, but may have some problems. +# - 2: degraded, and may have serious problems. +# - 3: unavailable, or will stop functioning soon. +# - 4: no health data is available. + +template: vcsa_swap_health + on: vcsa.components_health + lookup: max -10s unaligned of swap + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: swap health status + to: sysadmin + +template: vcsa_storage_health + on: vcsa.components_health + lookup: max -10s unaligned of storage + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: storage health status + to: sysadmin + +template: vcsa_mem_health + on: vcsa.components_health + lookup: max -10s unaligned of mem + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: mem health status + to: sysadmin + +template: vcsa_load_health + on: vcsa.components_health + lookup: max -10s unaligned of load + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: load health status + to: sysadmin + +template: vcsa_database_storage_health + on: vcsa.components_health + lookup: max -10s unaligned of database_storage + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: database storage health status + to: sysadmin + +template: vcsa_applmgmt_health + on: vcsa.components_health + lookup: max -10s unaligned of applmgmt + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: appl mgmt health status + to: sysadmin + + +# Software updates health: +# - 0: no updates available. +# - 2: non-security updates are available. +# - 3: security updates are available. +# - 4: an error retrieving information on software updates. + +template: vcsa_software_updates_health + on: vcsa.software_updates_health + lookup: max -10s unaligned of software_packages + units: status + every: 10s + warn: $this == 4 + crit: $this == 3 + delay: down 1m multiplier 1.5 max 1h + info: software packages health status + to: sysadmin diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf new file mode 100644 index 000000000..ffbe31baf --- /dev/null +++ b/health/health.d/zookeeper.conf @@ -0,0 +1,14 @@ + +# make sure zookeeper is running + +template: zookeeper_last_collected_secs + on: zookeeper.requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + |