Diffstat (limited to 'health/health.d')
84 files changed, 4162 insertions, 0 deletions
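Most of the templates added in these files repeat two expression patterns: a data-collection staleness check of the form "calc: $now - $last_collected_t" compared against multiples of $update_every, and warn/crit thresholds that tighten once the alarm has already been raised (hysteresis), for example "warn: $this > (($status >= $WARNING) ? (75) : (85))". Below is a minimal Python sketch of how such expressions evaluate; the function names, simplified status ordering and sample numbers are illustrative only and are not taken from any specific alarm in this changeset.

# Sketch of the hysteresis pattern used by many warn/crit lines below,
# e.g.  warn: $this > (($status >= $WARNING) ? (75) : (85))
# Once the alarm is already at WARNING or worse, the threshold drops,
# so the alarm does not flap around the boundary.

CLEAR, WARNING, CRITICAL = 0, 1, 2   # simplified status ordering (illustrative)

def warn_raised(value, status, raised_threshold=75, clear_threshold=85):
    threshold = raised_threshold if status >= WARNING else clear_threshold
    return value > threshold

# Staleness check used by the *_last_collected_secs templates:
#   calc: $now - $last_collected_t
#   warn: $this > (($status >= $WARNING) ? ($update_every) : (5 * $update_every))
def collection_is_stale(now, last_collected_t, update_every, status):
    age = now - last_collected_t
    limit = update_every if status >= WARNING else 5 * update_every
    return age > limit

print(warn_raised(80, CLEAR))     # False: 80 is below the 85 "clear" threshold
print(warn_raised(80, WARNING))   # True: 80 exceeds the tightened 75 threshold
print(collection_is_stale(100, 40, 10, CLEAR))   # True: 60s old vs 50s allowance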
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf new file mode 100644 index 0000000..a1301ce --- /dev/null +++ b/health/health.d/adaptec_raid.conf @@ -0,0 +1,24 @@ + +# logical device status check + +template: adapter_raid_ld_status + on: adapter_raid.ld_status + lookup: max -5s + units: bool + every: 10s + crit: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: at least 1 logical device is failed or degraded + to: sysadmin + +# physical device state check + +template: adapter_raid_pd_state + on: adapter_raid.pd_state + lookup: max -5s + units: bool + every: 10s + crit: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: at least 1 physical device is not in online state + to: sysadmin diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf new file mode 100644 index 0000000..ddf8b70 --- /dev/null +++ b/health/health.d/am2320.conf @@ -0,0 +1,12 @@ +# make sure am2320 is sending stats + +template: am2320_last_collected_secs + on: am2320.temperature + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster
\ No newline at end of file diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf new file mode 100644 index 0000000..a2d248e --- /dev/null +++ b/health/health.d/anomalies.conf @@ -0,0 +1,17 @@ +# raise a warning alarm if an anomaly probability is consistently above 50% + +template: anomaly_probabilities + on: anomalies.probability + lookup: average -2m foreach * + every: 1m + warn: $this > 50 + info: average anomaly probability > 50% for last 2 minutes + +# raise a warning alarm if an anomaly flag is consistently firing + +template: anomaly_flags + on: anomalies.anomaly + lookup: sum -2m foreach * + every: 1m + warn: $this > 10 + info: count of anomalies > 10 for last 2 minutes diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf new file mode 100644 index 0000000..0c98b87 --- /dev/null +++ b/health/health.d/apache.conf @@ -0,0 +1,14 @@ + +# make sure apache is running + +template: apache_last_collected_secs + on: apache.requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf new file mode 100644 index 0000000..4f86037 --- /dev/null +++ b/health/health.d/apcupsd.conf @@ -0,0 +1,40 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +template: 10min_ups_load + on: apcupsd.load + os: * + hosts: * + lookup: average -10m unaligned of percentage + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 10m multiplier 1.5 max 1h + info: average UPS load for the last 10 minutes + to: sitemgr + +# Discussion in https://github.com/netdata/netdata/pull/3928: +# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. +template: ups_charge + on: apcupsd.charge + os: * + hosts: * + lookup: average -60s unaligned of charge + units: % + every: 60s + warn: $this < 100 + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 10m multiplier 1.5 max 1h + info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors + to: sitemgr + +template: apcupsd_last_collected_secs + on: apcupsd.load + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sitemgr diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf new file mode 100644 index 0000000..9a27bc6 --- /dev/null +++ b/health/health.d/apps_plugin.conf @@ -0,0 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# disabled due to https://github.com/netdata/netdata/issues/10327 +# +# alarm: used_file_descriptors +# on: apps.files +# hosts: * +# calc: $fdperc +# units: % +# every: 5s +# warn: $this > (($status >= $WARNING) ? (75) : (80)) +# crit: $this > (($status == $CRITICAL) ? 
(85) : (90)) +# delay: down 5m multiplier 1.5 max 1h +# info: Peak percentage of file descriptors used +# to: sysadmin diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf new file mode 100644 index 0000000..e51b8aa --- /dev/null +++ b/health/health.d/backend.conf @@ -0,0 +1,56 @@ +# Alert that backends subsystem will be disabled soon + alarm: backend_metrics_eol + on: netdata.backend_metrics + units: boolean + calc: $now - $last_collected_t + every: 1m + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf. + to: sysadmin + +# make sure we are sending data to backend + + alarm: backend_last_buffering + on: netdata.backend_metrics + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful buffering of backend data + to: dba + + alarm: backend_metrics_sent + on: netdata.backend_metrics + units: % + calc: abs($sent) * 100 / abs($buffered) + every: 10s + warn: $this != 100 + delay: down 5m multiplier 1.5 max 1h + info: percentage of metrics sent to the backend server + to: dba + + alarm: backend_metrics_lost + on: netdata.backend_metrics + units: metrics + calc: abs($lost) + every: 10s + crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) + delay: down 5m multiplier 1.5 max 1h + info: number of metrics lost due to repeating failures to contact the backend server + to: dba + + +# this chart has been removed from netdata +# alarm: backend_slow +# on: netdata.backend_latency +# units: % +# calc: $latency * 100 / ($update_every * 1000) +# every: 10s +# warn: $this > 50 +# crit: $this > 100 +# delay: down 5m multiplier 1.5 max 1h +# info: the percentage of time between iterations needed by the backend time to process the data sent by netdata +# to: dba diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf new file mode 100644 index 0000000..f0da9ac --- /dev/null +++ b/health/health.d/bcache.conf @@ -0,0 +1,22 @@ + +template: bcache_cache_errors + on: disk.bcache_cache_read_races + lookup: sum -10m unaligned absolute + units: errors + every: 1m + warn: $this > 0 + crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) ) + delay: down 1h multiplier 1.5 max 2h + info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing) + to: sysadmin + +template: bcache_cache_dirty + on: disk.bcache_cache_alloc + calc: $dirty + $metadata + $undefined + units: % + every: 1m + warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) ) + crit: $this > ( ($status >= $CRITICAL) ? 
( 90 ) : ( 95 ) ) + delay: up 1m down 1h multiplier 1.5 max 2h + info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small) + to: sysadmin diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf new file mode 100644 index 0000000..30dc273 --- /dev/null +++ b/health/health.d/beanstalkd.conf @@ -0,0 +1,36 @@ +# get the number of buried jobs in all queues + +template: server_buried_jobs + on: beanstalk.current_jobs + calc: $buried + units: jobs + every: 10s + warn: $this > 0 + crit: $this > 10 + delay: up 0 down 5m multiplier 1.2 max 1h + info: the number of buried jobs aggregated across all tubes + to: sysadmin + +# get the number of buried jobs per queue + +#template: tube_buried_jobs +# on: beanstalk.jobs +# calc: $buried +# units: jobs +# every: 10s +# warn: $this > 0 +# crit: $this > 10 +# delay: up 0 down 5m multiplier 1.2 max 1h +# info: the number of jobs buried per tube +# to: sysadmin + +# get the current number of tubes + +#template: number_of_tubes +# on: beanstalk.current_tubes +# calc: $tubes +# every: 10s +# warn: $this < 5 +# delay: up 0 down 5m multiplier 1.2 max 1h +# info: the current number of tubes on the server +# to: sysadmin diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf new file mode 100644 index 0000000..4145e77 --- /dev/null +++ b/health/health.d/bind_rndc.conf @@ -0,0 +1,9 @@ + template: bind_rndc_stats_file_size + on: bind_rndc.stats_size + units: megabytes + every: 60 + calc: $stats_size + warn: $this > 512 + crit: $this > 1024 + info: Bind stats file is very large! Consider to create logrotate conf file for it! + to: sysadmin diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf new file mode 100644 index 0000000..43c588d --- /dev/null +++ b/health/health.d/boinc.conf @@ -0,0 +1,62 @@ +# Alarms for various BOINC issues. + +# Warn on any compute errors encountered. +template: boinc_compute_errors + on: boinc.states + os: * + hosts: * +families: * + lookup: average -10m unaligned of comperror + units: tasks + every: 1m + warn: $this > 0 + crit: $this > 1 + delay: up 1m down 5m multiplier 1.5 max 1h + info: the total number of compute errors over the past 10 minutes + to: sysadmin + +# Warn on lots of upload errors +template: boinc_upload_errors + on: boinc.states + os: * + hosts: * +families: * + lookup: average -10m unaligned of upload_failed + units: tasks + every: 1m + warn: $this > 0 + crit: $this > 1 + delay: up 1m down 5m multiplier 1.5 max 1h + info: the average number of failed uploads over the past 10 minutes + to: sysadmin + +# Warn on the task queue being empty +template: boinc_total_tasks + on: boinc.tasks + os: * + hosts: * +families: * + lookup: average -10m unaligned of total + units: tasks + every: 1m + warn: $this < 1 + crit: $this < 0.1 + delay: up 5m down 10m multiplier 1.5 max 1h + info: the total number of locally available tasks + to: sysadmin + +# Warn on no active tasks with a non-empty queue +template: boinc_active_tasks + on: boinc.tasks + os: * + hosts: * +families: * + lookup: average -10m unaligned of active + calc: ($boinc_total_tasks >= 1) ? 
($this) : (inf) + units: tasks + every: 1m + warn: $this < 1 + crit: $this < 0.1 + delay: up 5m down 10m multiplier 1.5 max 1h + info: the total number of active tasks + to: sysadmin diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf new file mode 100644 index 0000000..b27aa54 --- /dev/null +++ b/health/health.d/btrfs.conf @@ -0,0 +1,57 @@ + +template: btrfs_allocated + on: btrfs.disk + os: * + hosts: * +families: * + calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (90) : (95)) + crit: $this > (($status == $CRITICAL) ? (95) : (98)) + delay: up 1m down 15m multiplier 1.5 max 1h + info: the percentage of allocated BTRFS physical disk space + to: sysadmin + +template: btrfs_data + on: btrfs.data + os: * + hosts: * +families: * + calc: $used * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 + crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 + delay: up 1m down 15m multiplier 1.5 max 1h + info: the percentage of used BTRFS data space + to: sysadmin + +template: btrfs_metadata + on: btrfs.metadata + os: * + hosts: * +families: * + calc: ($used + $reserved) * 100 / ($used + $free + $reserved) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 + crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 + delay: up 1m down 15m multiplier 1.5 max 1h + info: the percentage of used BTRFS metadata space + to: sysadmin + +template: btrfs_system + on: btrfs.system + os: * + hosts: * +families: * + calc: $used * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 + crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 + delay: up 1m down 15m multiplier 1.5 max 1h + info: the percentage of used BTRFS system space + to: sysadmin + diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf new file mode 100644 index 0000000..de16f7b --- /dev/null +++ b/health/health.d/ceph.conf @@ -0,0 +1,13 @@ +# low ceph disk available + +template: cluster_space_usage + on: ceph.general_usage + calc: $avail * 100 / ($avail + $used) + units: % + every: 10s + warn: $this < 10 + crit: $this < 1 + delay: down 5m multiplier 1.2 max 1h + info: ceph disk usage is almost full + to: sysadmin + diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf new file mode 100644 index 0000000..79ece53 --- /dev/null +++ b/health/health.d/cgroups.conf @@ -0,0 +1,41 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +template: cgroup_10min_cpu_usage + on: cgroup.cpu_limit + os: linux + hosts: * + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: cpu utilization for the last 10 minutes + to: sysadmin + +template: cgroup_ram_in_use + on: cgroup.mem_usage + os: linux + hosts: * + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: RAM used by cgroup + to: sysadmin + +template: cgroup_ram_and_swap_in_use + on: cgroup.mem_usage + os: linux + hosts: * + calc: ($ram + $swap) * 100 / $memory_and_swap_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: RAM and Swap used by cgroup + to: sysadmin diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf new file mode 100644 index 0000000..8ab2c9d --- /dev/null +++ b/health/health.d/cockroachdb.conf @@ -0,0 +1,91 @@ + +# Availability + +template: cockroachdb_last_collected_secs + on: cockroachdb.live_nodes + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + +# Capacity + +template: cockroachdb_used_storage_capacity + on: cockroachdb.storage_used_capacity_percentage + calc: $capacity_used_percent + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: entire disk usage percentage + to: dba + +template: cockroachdb_used_usable_storage_capacity + on: cockroachdb.storage_used_capacity_percentage + calc: $capacity_usable_used_percent + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: usable space usage percentage + to: dba + +# Replication + +template: cockroachdb_unavailable_ranges + on: cockroachdb.ranges_replication_problem + calc: $ranges_unavailable + units: num + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of ranges with fewer live replicas than the replication target + to: dba + +template: cockroachdb_replicas_leaders_not_leaseholders + on: cockroachdb.replicas_leaders + calc: $replicas_leaders_not_leaseholders + units: num + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of replicas that are Raft leaders whose range lease is held by another store + to: dba + +# FD + +template: cockroachdb_open_file_descriptors_limit + on: cockroachdb.process_file_descriptors + calc: $sys_fd_open/$sys_fd_softlimit * 100 + units: % + every: 10s + warn: $this > 80 + delay: down 15m multiplier 1.5 max 1h + info: open file descriptors usage percentage + to: dba + +# SQL + +template: cockroachdb_sql_active_connections + on: cockroachdb.sql_connections + calc: $sql_conns + units: active connections + every: 10s + info: number of active SQL connections + to: dba + +template: cockroachdb_sql_executed_statements_total_last_5m + on: cockroachdb.sql_statements_total + lookup: sum -5m absolute of sql_query_count + units: statements + every: 10s + warn: $this == 0 AND $cockroachdb_sql_active_connections != 0 + delay: down 15m up 30s multiplier 1.5 max 1h + info: number of executed SQL statements in the last 5 minutes + to: dba diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf new file mode 100644 index 0000000..4a28952 --- /dev/null +++ b/health/health.d/couchdb.conf @@ -0,0 +1,13 @@ + +# make sure couchdb is running + +template: 
couchdb_last_collected_secs + on: couchdb.request_methods + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf new file mode 100644 index 0000000..fa81898 --- /dev/null +++ b/health/health.d/cpu.conf @@ -0,0 +1,55 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +template: 10min_cpu_usage + on: system.cpu + os: linux + hosts: * + lookup: average -10m unaligned of user,system,softirq,irq,guest + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal) + to: sysadmin + +template: 10min_cpu_iowait + on: system.cpu + os: linux + hosts: * + lookup: average -10m unaligned of iowait + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (20) : (40)) + crit: $this > (($status == $CRITICAL) ? (40) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average CPU wait I/O for the last 10 minutes + to: sysadmin + +template: 20min_steal_cpu + on: system.cpu + os: linux + hosts: * + lookup: average -20m unaligned of steal + units: % + every: 5m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? (20) : (30)) + delay: down 1h multiplier 1.5 max 2h + info: average CPU steal time for the last 20 minutes + to: sysadmin + +## FreeBSD +template: 10min_cpu_usage + on: system.cpu + os: freebsd + hosts: * + lookup: average -10m unaligned of user,system,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? 
(85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: average cpu utilization for the last 10 minutes (excluding nice) + to: sysadmin diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf new file mode 100644 index 0000000..274673e --- /dev/null +++ b/health/health.d/dbengine.conf @@ -0,0 +1,50 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of fs_errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + to: sysadmin + + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of io_errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) + to: sysadmin + + alarm: 10min_dbengine_global_flushing_warnings + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of pg_cache_over_half_dirty_events + units: errors + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks + to: sysadmin + + alarm: 10min_dbengine_global_flushing_errors + on: netdata.dbengine_long_term_page_stats + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of flushing_pressure_deletions + units: pages + every: 10s + crit: $this != 0 + delay: down 1h multiplier 1.5 max 3h + info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks + to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf new file mode 100644 index 0000000..9c194ce --- /dev/null +++ b/health/health.d/disks.conf @@ -0,0 +1,167 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + +# ----------------------------------------------------------------------------- +# low disk space + +# checking the latest collected values +# raise an alarm if the disk is low on +# available disk space + +template: disk_space_usage + on: disk.space + os: linux freebsd + hosts: * +families: !/dev !/dev/* !/run !/run/* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: up 1m down 15m multiplier 1.5 max 1h + info: current disk space usage + to: sysadmin + +template: disk_inode_usage + on: disk.inodes + os: linux freebsd + hosts: * +families: !/dev !/dev/* !/run !/run/* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: up 1m down 15m multiplier 1.5 max 1h + info: current disk inode usage + to: sysadmin + + +# ----------------------------------------------------------------------------- +# disk fill rate + +# calculate the rate the disk fills +# use as base, the available space change +# during the last hour + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + +template: disk_fill_rate + on: disk.space + os: linux freebsd + hosts: * +families: * + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: GB/hour + info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour + + +# calculate the hours remaining +# if the disk continues to fill +# in this rate + +template: out_of_disk_space_time + on: disk.space + os: linux freebsd + hosts: * +families: * + calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.2 max 1h + info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour + to: sysadmin + + +# ----------------------------------------------------------------------------- +# disk inode fill rate + +# calculate the rate the disk inodes are allocated +# use as base, the available inodes change +# during the last hour + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + +template: disk_inode_rate + on: disk.inodes + os: linux freebsd + hosts: * +families: * + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: inodes/hour + info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour + +# calculate the hours remaining +# if the disk inodes are allocated +# in this rate + +template: out_of_disk_inodes_time + on: disk.inodes + os: linux freebsd + hosts: * +families: * + calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.2 max 1h + info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour + to: sysadmin + + +# ----------------------------------------------------------------------------- +# disk congestion + +# raise an alarm if the disk is congested +# by calculating the average disk utilization +# for the last 10 minutes + +template: 10min_disk_utilization + on: disk.util + os: linux freebsd + hosts: * +families: * + lookup: average -10m unaligned + units: % + every: 1m + green: 90 + red: 98 + warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) + crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + info: the percentage of time the disk was busy, during the last 10 minutes + to: sysadmin + + +# raise an alarm if the disk backlog +# is above 1000ms (1s) per second +# for 10 minutes +# (i.e. 
the disk cannot catch up) + +template: 10min_disk_backlog + on: disk.backlog + os: linux + hosts: * +families: * + lookup: average -10m unaligned + units: ms + every: 1m + green: 2000 + red: 5000 + warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) + crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + info: average of the kernel estimated disk backlog, for the last 10 minutes + to: sysadmin diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf new file mode 100644 index 0000000..113c950 --- /dev/null +++ b/health/health.d/dns_query.conf @@ -0,0 +1,12 @@ + +# detect dns query failure + +template: dns_query_time_query_time + on: dns_query_time.query_time + lookup: average -10s unaligned foreach * + units: ms + every: 10s + warn: $this == nan + delay: up 20s down 5m multiplier 1.5 max 1h + info: query round trip time + to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf new file mode 100644 index 0000000..ecf3b84 --- /dev/null +++ b/health/health.d/dnsmasq_dhcp.conf @@ -0,0 +1,12 @@ +# dhcp-range utilization + +template: dnsmasq_dhcp_dhcp_range_utilization + on: dnsmasq_dhcp.dhcp_range_utilization + every: 10s + units: % + calc: $used + warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) + crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + delay: down 5m + info: dhcp-range utilization above threshold! + to: sysadmin diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf new file mode 100644 index 0000000..729906c --- /dev/null +++ b/health/health.d/dockerd.conf @@ -0,0 +1,8 @@ +template: docker_unhealthy_containers + on: docker.unhealthy_containers + units: unhealthy containers + every: 10s + lookup: average -10s + crit: $this > 0 + info: number of unhealthy containers + to: sysadmin diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf new file mode 100644 index 0000000..f442344 --- /dev/null +++ b/health/health.d/elasticsearch.conf @@ -0,0 +1,12 @@ + +# make sure elasticsearch is running + +template: elasticsearch_last_collected + on: elasticsearch.cluster_health_status + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf new file mode 100644 index 0000000..66d44ec --- /dev/null +++ b/health/health.d/entropy.conf @@ -0,0 +1,16 @@ + +# check if entropy is too low +# the alarm is checked every 1 minute +# and examines the last hour of data + + alarm: lowest_entropy + on: system.entropy + os: linux + hosts: * + lookup: min -10m unaligned + units: entries + every: 5m + warn: $this < (($status >= $WARNING) ? (200) : (100)) + delay: down 1h multiplier 1.5 max 2h + info: minimum entries in the random numbers pool in the last 10 minutes + to: silent diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf new file mode 100644 index 0000000..506cb0c --- /dev/null +++ b/health/health.d/exporting.conf @@ -0,0 +1,34 @@ + +template: exporting_last_buffering +families: * + on: exporting_data_size + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? 
($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful buffering of exporting data + to: dba + +template: exporting_metrics_sent +families: * + on: exporting_data_size + units: % + calc: abs($sent) * 100 / abs($buffered) + every: 10s + warn: $this != 100 + delay: down 5m multiplier 1.5 max 1h + info: percentage of metrics sent to the external database server + to: dba + +template: exporting_metrics_lost +families: * + on: exporting_data_size + units: metrics + calc: abs($lost) + every: 10s + crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) + delay: down 5m multiplier 1.5 max 1h + info: number of metrics lost due to repeating failures to contact the external database server + to: dba diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf new file mode 100644 index 0000000..43658fe --- /dev/null +++ b/health/health.d/fping.conf @@ -0,0 +1,53 @@ + +template: fping_last_collected_secs +families: * + on: fping.latency + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +template: host_reachable +families: * + on: fping.latency + calc: $average != nan + units: up/down + every: 10s + crit: $this == 0 + info: states if the remote host is reachable + delay: down 30m multiplier 1.5 max 2h + to: sysadmin + +template: host_latency +families: * + on: fping.latency + lookup: average -10s unaligned of average + units: ms + every: 10s + green: 500 + red: 1000 + warn: $this > $green OR $max > $red + crit: $this > $red + info: average round trip delay during the last 10 seconds + delay: down 30m multiplier 1.5 max 2h + to: sysadmin + +template: packet_loss +families: * + on: fping.quality + lookup: average -10m unaligned of returned + calc: 100 - $this + green: 1 + red: 10 + units: % + every: 10s + warn: $this > $green + crit: $this > $red + info: packet loss percentage + delay: down 30m multiplier 1.5 max 2h + to: sysadmin + diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf new file mode 100644 index 0000000..cdf6c8f --- /dev/null +++ b/health/health.d/fronius.conf @@ -0,0 +1,11 @@ +template: fronius_last_collected_secs +families: * + on: fronius.power + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sitemgr diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf new file mode 100644 index 0000000..e3863ae --- /dev/null +++ b/health/health.d/gearman.conf @@ -0,0 +1,22 @@ +# make sure Gearman is running +template: gearman_last_collected_secs + on: gearman.total_jobs + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +template: gearman_workers_queued + on: gearman.single_job + lookup: average -10m unaligned match-names of Queued + units: workers + every: 10s + warn: $this > 30000 + crit: $this > 100000 + delay: down 5m multiplier 1.5 max 1h + info: number of queued jobs + to: sysadmin
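An aside on the fping.conf templates a few files above: the packet_loss template averages the "returned" dimension over the last 10 minutes and, as its calc line shows, turns that into a loss percentage (100 minus the averaged "returned" value) before comparing it against the green/red thresholds. A small Python sketch with hypothetical sample values follows; the averaging window, thresholds and formula mirror the config, while the series itself is made up.

# Sketch of health.d/fping.conf's packet_loss template (see above):
#   lookup: average -10m unaligned of returned   -> average % of pings returned
#   calc:   100 - $this                          -> packet loss %
#   warn:   $this > $green   (green: 1)
#   crit:   $this > $red     (red: 10)
# The sample series below is hypothetical.

returned_pct_last_10m = [100, 100, 98, 97, 100, 95]   # per-iteration "returned" %

avg_returned = sum(returned_pct_last_10m) / len(returned_pct_last_10m)
packet_loss = 100 - avg_returned

GREEN, RED = 1, 10
status = "CRITICAL" if packet_loss > RED else "WARNING" if packet_loss > GREEN else "CLEAR"
print("loss=%.2f%% -> %s" % (packet_loss, status))   # loss=1.67% -> WARNING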
\ No newline at end of file diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf new file mode 100644 index 0000000..e49c70d --- /dev/null +++ b/health/health.d/haproxy.conf @@ -0,0 +1,27 @@ +template: haproxy_backend_server_status + on: haproxy_hs.down + units: failed servers + every: 10s + lookup: average -10s + crit: $this > 0 + info: number of failed haproxy backend servers + to: sysadmin + +template: haproxy_backend_status + on: haproxy_hb.down + units: failed backend + every: 10s + lookup: average -10s + crit: $this > 0 + info: number of failed haproxy backends + to: sysadmin + +template: haproxy_last_collected + on: haproxy_hb.down + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf new file mode 100644 index 0000000..678faab --- /dev/null +++ b/health/health.d/hdfs.conf @@ -0,0 +1,75 @@ + +# make sure hdfs is running + +template: hdfs_last_collected_secs + on: hdfs.heap_memory + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + +# Common + +template: hdfs_capacity_usage + on: hdfs.capacity + calc: ($used) * 100 / ($used + $remaining) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used capacity + to: sysadmin + + +# NameNode + +template: hdfs_missing_blocks + on: hdfs.blocks + calc: $missing + units: missing blocks + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: missing blocks + to: sysadmin + + +template: hdfs_stale_nodes + on: hdfs.data_nodes + calc: $stale + units: dead nodes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: stale data nodes + to: sysadmin + + +template: hdfs_dead_nodes + on: hdfs.data_nodes + calc: $dead + units: dead nodes + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: dead data nodes + to: sysadmin + + +# DataNode + +template: hdfs_num_failed_volumes + on: hdfs.num_failed_volumes + calc: $fsds_num_failed_volumes + units: failed volumes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: failed volumes + to: sysadmin diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf new file mode 100644 index 0000000..0ddf35e --- /dev/null +++ b/health/health.d/httpcheck.conf @@ -0,0 +1,99 @@ +template: httpcheck_last_collected_secs +families: * + on: httpcheck.status + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges +template: web_service_up +families: * + on: httpcheck.status + lookup: average -1m unaligned percentage of success + calc: ($this < 75) ? (0) : ($this) + every: 5s + units: up/down + info: at least 75% verified responses during last 60 seconds, ideal for badges + to: silent + +template: web_service_bad_content +families: * + on: httpcheck.status + lookup: average -5m unaligned percentage of bad_content + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + info: average of unexpected http response content during the last 5 minutes + options: no-clear-notification + to: webmaster + +template: web_service_bad_status +families: * + on: httpcheck.status + lookup: average -5m unaligned percentage of bad_status + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + info: average of unexpected http status during the last 5 minutes + options: no-clear-notification + to: webmaster + +template: web_service_timeouts +families: * + on: httpcheck.status + lookup: average -5m unaligned percentage of timeout + every: 10s + units: % + info: average of timeouts during the last 5 minutes + +template: no_web_service_connections +families: * + on: httpcheck.status + lookup: average -5m unaligned percentage of no_connection + every: 10s + units: % + info: average of failed requests during the last 5 minutes + +# combined timeout & no connection alarm +template: web_service_unreachable +families: * + on: httpcheck.status + calc: ($no_web_service_connections >= $web_service_timeouts) ? 
($no_web_service_connections) : ($web_service_timeouts) + units: % + every: 10s + warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40) + crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40 + delay: down 5m multiplier 1.5 max 1h + info: average of failed requests either due to timeouts or no connection during the last 5 minutes + options: no-clear-notification + to: webmaster + +template: 1h_web_service_response_time +families: * + on: httpcheck.responsetime + lookup: average -1h unaligned of time + every: 30s + units: ms + info: average response time over the last hour + +template: web_service_slow +families: * + on: httpcheck.responsetime + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_web_service_response_time * 2) ) + crit: ($this > ($1h_web_service_response_time * 3) ) + info: average response time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + options: no-clear-notification + to: webmaster diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf new file mode 100644 index 0000000..59a5c8e --- /dev/null +++ b/health/health.d/ioping.conf @@ -0,0 +1,13 @@ +template: disk_latency +families: * + on: ioping.latency + lookup: average -10s unaligned of average + units: ms + every: 10s + green: 500 + red: 1000 + warn: $this > $green OR $max > $red + crit: $this > $red + info: average round trip delay during the last 10 seconds + delay: down 30m multiplier 1.5 max 2h + to: sysadmin diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf new file mode 100644 index 0000000..989d6e9 --- /dev/null +++ b/health/health.d/ipc.conf @@ -0,0 +1,28 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: semaphores_used + on: system.ipc_semaphores + os: linux + hosts: * + calc: $semaphores * 100 / $ipc_semaphores_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (70) : (90)) + delay: down 5m multiplier 1.5 max 1h + info: the percentage of IPC semaphores used + to: sysadmin + + alarm: semaphore_arrays_used + on: system.ipc_semaphore_arrays + os: linux + hosts: * + calc: $arrays * 100 / $ipc_semaphores_arrays_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (70) : (90)) + delay: down 5m multiplier 1.5 max 1h + info: the percentage of IPC semaphore arrays used + to: sysadmin diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf new file mode 100644 index 0000000..3f77572 --- /dev/null +++ b/health/health.d/ipfs.conf @@ -0,0 +1,11 @@ + +template: ipfs_datastore_usage + on: ipfs.repo_size + calc: $size * 100 / $avail + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: ipfs Datastore close to running out of space + to: sysadmin diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf new file mode 100644 index 0000000..c255819 --- /dev/null +++ b/health/health.d/ipmi.conf @@ -0,0 +1,20 @@ + alarm: ipmi_sensors_states + on: ipmi.sensors_states + calc: $warning + $critical + units: sensors + every: 10s + warn: $this > 0 + crit: $critical > 0 + delay: up 5m down 15m multiplier 1.5 max 1h + info: the number IPMI sensors in non-nominal state + to: sysadmin + + alarm: ipmi_events + on: ipmi.events + calc: $events + units: events + every: 10s + warn: $this > 0 + delay: up 5m down 15m multiplier 1.5 max 1h + info: the number of events in the IPMI System Event Log (SEL) + to: sysadmin diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf new file mode 100644 index 0000000..8054656 --- /dev/null +++ b/health/health.d/isc_dhcpd.conf @@ -0,0 +1,10 @@ + template: isc_dhcpd_leases_size + on: isc_dhcpd.leases_total + units: KB + every: 60 + calc: $leases_size + warn: $this > 3072 + crit: $this > 6144 + delay: up 2m down 5m + info: dhcpd.leases file too big! Module can slow down your server. + to: sysadmin diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf new file mode 100644 index 0000000..d2ef24b --- /dev/null +++ b/health/health.d/kubelet.conf @@ -0,0 +1,115 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- + +# True (1) if the node is experiencing a configuration-related error, false (0) otherwise. + + template: node_config_error + on: k8s_kubelet.kubelet_node_config_error + calc: $kubelet_node_config_error + units: bool + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 2h + info: the node is experiencing a configuration-related error + to: sysadmin + +# Failed Token() requests to the alternate token source + + template: token_requests + lookup: sum -10s of token_fail_count + on: k8s_kubelet.kubelet_token_requests + units: failed requests + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 2h + info: failed token requests to alternate token source + to: sysadmin + +# Docker and runtime operation errors + + template: kubelet_operations_error + lookup: sum -1m + on: k8s_kubelet.kubelet_operations_errors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (20)) + delay: up 30s down 1m multiplier 1.5 max 2h + info: operations error + to: sysadmin + +# ----------------------------------------------------------------------------- + +# Pod Lifecycle Event Generator Relisting Latency + +# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99) +# 2. do the same for the last 10s +# 3. 
raise an alarm if the later is: +# - 2x the first for quantile 0.5 +# - 4x the first for quantile 0.9 +# - 8x the first for quantile 0.99 +# +# we assume the minimum latency is 1000 microseconds + +# quantile 0.5 + +template: 1m_kubelet_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 + units: microseconds + every: 10s + info: the average value of pleg relisting latency during the last minute (quantile 0.5) + +template: 10s_kubelet_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 + calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(100):(200)) + crit: $this > (($status >= $WARNING)?(200):(400)) + delay: down 1m multiplier 1.5 max 2h + info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.5) + to: sysadmin + +# quantile 0.9 + +template: 1m_kubelet_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 + units: microseconds + every: 10s + info: the average value of pleg relisting latency during the last minute (quantile 0.9) + +template: 10s_kubelet_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 + calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(400)) + crit: $this > (($status >= $WARNING)?(400):(800)) + delay: down 1m multiplier 1.5 max 2h + info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.9) + to: sysadmin + +# quantile 0.99 + +template: 1m_kubelet_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 + units: microseconds + every: 10s + info: the average value of pleg relisting latency during the last minute (quantile 0.99) + +template: 10s_kubelet_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 + calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(400):(800)) + crit: $this > (($status >= $WARNING)?(800):(1200)) + delay: down 1m multiplier 1.5 max 2h + info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.99) + to: sysadmin diff --git a/health/health.d/lighttpd.conf b/health/health.d/lighttpd.conf new file mode 100644 index 0000000..915907a --- /dev/null +++ b/health/health.d/lighttpd.conf @@ -0,0 +1,14 @@ + +# make sure lighttpd is running + +template: lighttpd_last_collected_secs + on: lighttpd.requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf new file mode 100644 index 0000000..38727be --- /dev/null +++ b/health/health.d/linux_power_supply.conf @@ -0,0 +1,12 @@ +# Alert on low battery capacity. + +template: linux_power_supply_capacity + on: powersupply.capacity + calc: $capacity + units: % + every: 10s + warn: $this < 10 + crit: $this < 5 + delay: up 30s down 5m multiplier 1.2 max 1h + info: the percentage remaining capacity of the power supply + to: sysadmin diff --git a/health/health.d/load.conf b/health/health.d/load.conf new file mode 100644 index 0000000..ee0c54b --- /dev/null +++ b/health/health.d/load.conf @@ -0,0 +1,56 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# Calculate the base trigger point for the load average alarms. +# This is the maximum number of CPU's in the system over the past 1 +# minute, with a special case for a single CPU of setting the trigger at 2. + alarm: load_trigger + on: system.load + os: linux + hosts: * + calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors ) + units: cpus + every: 1m + info: trigger point for load average alarms + +# Send alarms if the load average is unusually high. +# These intentionally _do not_ calculate the average over the sampled +# time period because the values being checked already are averages. + alarm: load_average_15 + on: system.load + os: linux + hosts: * + lookup: max -1m unaligned of load15 + units: load + every: 1m + warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger)) + crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger)) + delay: down 15m multiplier 1.5 max 1h + info: fifteen-minute load average + to: sysadmin + + alarm: load_average_5 + on: system.load + os: linux + hosts: * + lookup: max -1m unaligned of load5 + units: load + every: 1m + warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger)) + crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger)) + delay: down 15m multiplier 1.5 max 1h + info: five-minute load average + to: sysadmin + + alarm: load_average_1 + on: system.load + os: linux + hosts: * + lookup: max -1m unaligned of load1 + units: load + every: 1m + warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger)) + crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger)) + delay: down 15m multiplier 1.5 max 1h + info: one-minute load average + to: sysadmin diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf new file mode 100644 index 0000000..2f906e1 --- /dev/null +++ b/health/health.d/mdstat.conf @@ -0,0 +1,38 @@ +template: mdstat_last_collected + on: md.disks + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin + +template: mdstat_disks + on: md.disks + units: failed devices + every: 10s + calc: $down + crit: $this > 0 + info: Array is degraded! 
+ to: sysadmin + +template: mdstat_mismatch_cnt + on: md.mismatch_cnt + units: unsynchronized blocks + calc: $count + every: 60s + warn: $this > 1024 + delay: up 30m + info: Mismatch count! + to: sysadmin + +template: mdstat_nonredundant_last_collected + on: md.nonredundant + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin
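Similarly, for the load.conf alarms above: the load-average thresholds scale with the machine's CPU count through the load_trigger helper alarm (with a floor of 2), and the 15-minute alarm warns above roughly twice that value. A minimal Python sketch of that scaling, using a hypothetical 8-CPU host; the function names and sample figures are illustrative, not part of the config.

# Sketch of how load.conf (above) scales its thresholds with CPU count.
# load_trigger: the number of active CPUs, falling back to 2 when the count
# is unknown (nan/inf) or below 2.
# load_average_15 warns above 2 x load_trigger (1.75 x once already warning)
# and goes critical above 4 x load_trigger (3.5 x once already critical).
# The CPU count and load value below are hypothetical.
import math

def load_trigger(active_processors):
    if (active_processors is None or math.isnan(active_processors)
            or math.isinf(active_processors) or active_processors < 2):
        return 2
    return active_processors

def load15_status(load15, active_processors, status="CLEAR"):
    trigger = load_trigger(active_processors)
    warn = (1.75 if status != "CLEAR" else 2) * trigger
    crit = (3.5 if status == "CRITICAL" else 4) * trigger
    if load15 > crit:
        return "CRITICAL"
    if load15 > warn:
        return "WARNING"
    return "CLEAR"

print(load15_status(load15=18.0, active_processors=8))   # WARNING: 18 > 2 * 8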
\ No newline at end of file diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf new file mode 100644 index 0000000..6e81a2a --- /dev/null +++ b/health/health.d/megacli.conf @@ -0,0 +1,48 @@ +template: adapter_state + on: megacli.adapter_degraded + units: is degraded + lookup: sum -10s + every: 10s + crit: $this > 0 + info: adapter state + to: sysadmin + +template: bbu_relative_charge + on: megacli.bbu_relative_charge + units: percent + lookup: average -10s + every: 10s + warn: $this <= (($status >= $WARNING) ? (85) : (80)) + crit: $this <= (($status == $CRITICAL) ? (50) : (40)) + info: BBU relative state of charge + to: sysadmin + +template: bbu_cycle_count + on: megacli.bbu_cycle_count + units: cycle count + lookup: average -10s + every: 10s + warn: $this >= 100 + crit: $this >= 500 + info: BBU cycle count + to: sysadmin + +template: pd_media_errors + on: megacli.pd_media_error + units: media errors + lookup: sum -10s + every: 10s + warn: $this > 0 + delay: down 1m multiplier 2 max 10m + info: physical drive media errors + to: sysadmin + +template: pd_predictive_failures + on: megacli.pd_predictive_failure + units: predictive failures + lookup: sum -10s + every: 10s + warn: $this > 0 + delay: down 1m multiplier 2 max 10m + info: physical drive predictive failures + to: sysadmin diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf new file mode 100644 index 0000000..d248ef5 --- /dev/null +++ b/health/health.d/memcached.conf @@ -0,0 +1,52 @@ + +# make sure memcached is running + +template: memcached_last_collected_secs + on: memcached.cache + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + + +# detect if memcached cache is full + +template: memcached_cache_memory_usage + on: memcached.cache + calc: $used * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: current cache memory usage + to: dba + + +# find the rate memcached cache is filling + +template: cache_fill_rate + on: memcached.cache + lookup: min -10m at -50m unaligned of available + calc: ($this - $available) / (($now - $after) / 3600) + units: KB/hour + every: 1m + info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour + + +# find the hours remaining until memcached cache is full + +template: out_of_cache_space_time + on: memcached.cache + calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) + delay: down 15m multiplier 1.5 max 1h + info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour + to: dba diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf new file mode 100644 index 0000000..4a0e6e5 --- /dev/null +++ b/health/health.d/memory.conf @@ -0,0 +1,38 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 1hour_ecc_memory_correctable + on: mem.ecc_ce + os: linux + hosts: * + lookup: sum -10m unaligned + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: number of ECC correctable errors during the last hour + to: sysadmin + + alarm: 1hour_ecc_memory_uncorrectable + on: mem.ecc_ue + os: linux + hosts: * + lookup: sum -10m unaligned + units: errors + every: 1m + crit: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: number of ECC uncorrectable errors during the last hour + to: sysadmin + + alarm: 1hour_memory_hw_corrupted + on: mem.hwcorrupt + os: linux + hosts: * + calc: $HardwareCorrupted + units: MB + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: amount of memory corrupted due to a hardware failure + to: sysadmin diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf new file mode 100644 index 0000000..a80cb31 --- /dev/null +++ b/health/health.d/mongodb.conf @@ -0,0 +1,13 @@ + +# make sure mongodb is running + +template: mongodb_last_collected_secs + on: mongodb.read_operations + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf new file mode 100644 index 0000000..62cef5a --- /dev/null +++ b/health/health.d/mysql.conf @@ -0,0 +1,146 @@ + +# make sure mysql is running + +template: mysql_last_collected_secs + on: mysql.queries + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + + +# ----------------------------------------------------------------------------- +# slow queries + +template: mysql_10s_slow_queries + on: mysql.queries + lookup: sum -10s of slow_queries + units: slow queries + every: 10s + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? 
(10) : (20)) + delay: down 5m multiplier 1.5 max 1h + info: number of mysql slow queries over the last 10 seconds + to: dba + + +# ----------------------------------------------------------------------------- +# lock waits + +template: mysql_10s_table_locks_immediate + on: mysql.table_locks + lookup: sum -10s absolute of immediate + units: immediate locks + every: 10s + info: number of table immediate locks over the last 10 seconds + to: dba + +template: mysql_10s_table_locks_waited + on: mysql.table_locks + lookup: sum -10s absolute of waited + units: waited locks + every: 10s + info: number of table waited locks over the last 10 seconds + to: dba + +template: mysql_10s_waited_locks_ratio + on: mysql.table_locks + calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (10) : (25)) + crit: $this > (($status == $CRITICAL) ? (25) : (50)) + delay: down 30m multiplier 1.5 max 1h + info: the ratio of mysql waited table locks, for the last 10 seconds + to: dba + + +# ----------------------------------------------------------------------------- +# connections + +template: mysql_connections + on: mysql.connections_active + calc: $active * 100 / $limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + info: the ratio of current active connections vs the maximum possible number of connections + to: dba + + +# ----------------------------------------------------------------------------- +# replication + +template: mysql_replication + on: mysql.slave_status + calc: ($sql_running <= 0 OR $io_running <= 0)?0:1 + units: ok/failed + every: 10s + crit: $this == 0 + delay: down 5m multiplier 1.5 max 1h + info: checks if mysql replication has stopped + to: dba + +template: mysql_replication_lag + on: mysql.slave_behind + calc: $seconds + units: seconds + every: 10s + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? 
(10) : (30)) + delay: down 15m multiplier 1.5 max 1h + info: the number of seconds mysql replication is behind this master + to: dba + + +# ----------------------------------------------------------------------------- +# galera cluster size + +template: mysql_galera_cluster_size_max_2m + on: mysql.galera_cluster_size + lookup: max -2m absolute + units: nodes + every: 10s + info: max cluster size 2 minute + to: dba + +template: mysql_galera_cluster_size + on: mysql.galera_cluster_size + calc: $nodes + units: nodes + every: 10s + warn: $this > $mysql_galera_cluster_size_max_2m + crit: $this < $mysql_galera_cluster_size_max_2m + delay: up 20s down 5m multiplier 1.5 max 1h + info: cluster size has changed + to: dba + +# galera node state + +template: mysql_galera_cluster_state + on: mysql.galera_cluster_state + calc: $state + every: 10s + warn: $this < 4 + crit: $this < 2 + delay: up 30s down 5m multiplier 1.5 max 1h + info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced) + to: dba + + +# galera node status + +template: mysql_galera_cluster_status + on: mysql.galera_cluster_status + calc: $wsrep_cluster_status + every: 10s + crit: $mysql_galera_cluster_state != nan AND $this != 0 + delay: up 30s down 5m multiplier 1.5 max 1h + info: node and cluster status (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected) + to: dba diff --git a/health/health.d/named.conf b/health/health.d/named.conf new file mode 100644 index 0000000..4fc65c8 --- /dev/null +++ b/health/health.d/named.conf @@ -0,0 +1,14 @@ + +# make sure named is running + +template: named_last_collected_secs + on: named.global_queries + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: domainadmin + diff --git a/health/health.d/net.conf b/health/health.d/net.conf new file mode 100644 index 0000000..261290e --- /dev/null +++ b/health/health.d/net.conf @@ -0,0 +1,195 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# net traffic overflow + + template: interface_speed + on: net.net + os: * + hosts: * + families: * + calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan ) + units: Mbit + every: 10s + info: The current speed of the physical network interface + + template: 1m_received_traffic_overflow + on: net.net + os: linux + hosts: * + families: * + lookup: average -1m unaligned absolute of received + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (90)) + delay: down 1m multiplier 1.5 max 1h + info: interface received bandwidth usage over net device speed max + to: sysadmin + + template: 1m_sent_traffic_overflow + on: net.net + os: linux + hosts: * + families: * + lookup: average -1m unaligned absolute of sent + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? 
(85) : (90)) + delay: down 1m multiplier 1.5 max 1h + info: interface sent bandwidth usage over net device speed max + to: sysadmin + +# ----------------------------------------------------------------------------- +# dropped packets + +# check if an interface is dropping packets +# the alarm is checked every 1 minute +# and examines the last 10 minutes of data +# +# it is possible to have expected packet drops on an interface for some network configurations +# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information + +template: inbound_packets_dropped + on: net.drops + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound dropped packets in the last 10 minutes + to: sysadmin + +template: outbound_packets_dropped + on: net.drops + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound dropped packets in the last 10 minutes + to: sysadmin + +template: inbound_packets_dropped_ratio + on: net.packets + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute of received + calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0)) + units: % + every: 1m + warn: $this >= 0.1 + crit: $this >= 2 + delay: down 1h multiplier 1.5 max 2h + info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes + to: sysadmin + +template: outbound_packets_dropped_ratio + on: net.packets + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute of sent + calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0)) + units: % + every: 1m + warn: $this >= 0.1 + crit: $this >= 2 + delay: down 1h multiplier 1.5 max 2h + info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes + to: sysadmin + +# ----------------------------------------------------------------------------- +# interface errors + +template: interface_inbound_errors + on: net.errors + os: freebsd + hosts: * +families: * + lookup: sum -10m unaligned absolute of inbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound errors in the last 10 minutes + to: sysadmin + +template: interface_outbound_errors + on: net.errors + os: freebsd + hosts: * +families: * + lookup: sum -10m unaligned absolute of outbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound errors in the last 10 minutes + to: sysadmin + +# ----------------------------------------------------------------------------- +# FIFO errors + +# check if an interface is having FIFO +# buffer errors +# the alarm is checked every 1 minute +# and examines the last 10 minutes of data + +template: 10min_fifo_errors + on: net.fifo + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + info: interface fifo errors in the last 10 minutes + to: sysadmin + +# ----------------------------------------------------------------------------- +# check for packet storms + +# 1. 
calculate the rate packets are received in 1m: 1m_received_packets_rate +# 2. do the same for the last 10s +# 3. raise an alarm if the later is 10x or 20x the first +# we assume the minimum packet storm should at least have +# 10000 packets/s, average of the last 10 seconds + +template: 1m_received_packets_rate + on: net.packets + os: linux freebsd + hosts: * +families: * + lookup: average -1m unaligned of received + units: packets + every: 10s + info: the average number of packets received during the last minute + +template: 10s_received_packets_storm + on: net.packets + os: linux freebsd + hosts: * +families: * + lookup: average -10s unaligned of received + calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(5000)) + crit: $this > (($status >= $WARNING)?(5000):(6000)) + options: no-clear-notification + info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent) + to: sysadmin diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf new file mode 100644 index 0000000..1d07752 --- /dev/null +++ b/health/health.d/netfilter.conf @@ -0,0 +1,29 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: netfilter_last_collected_secs + on: netfilter.conntrack_sockets + os: linux + hosts: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + + alarm: netfilter_conntrack_full + on: netfilter.conntrack_sockets + os: linux + hosts: * + lookup: max -10s unaligned of connections + calc: $this * 100 / $netfilter_conntrack_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 5m multiplier 1.5 max 1h + info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size + to: sysadmin diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf new file mode 100644 index 0000000..a686c3d --- /dev/null +++ b/health/health.d/nginx.conf @@ -0,0 +1,14 @@ + +# make sure nginx is running + +template: nginx_last_collected_secs + on: nginx.requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + diff --git a/health/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf new file mode 100644 index 0000000..5a171a7 --- /dev/null +++ b/health/health.d/nginx_plus.conf @@ -0,0 +1,14 @@ + +# make sure nginx_plus is running + +template: nginx_plus_last_collected_secs + on: nginx_plus.requests_total + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + diff --git a/health/health.d/phpfpm.conf b/health/health.d/phpfpm.conf new file mode 100644 index 0000000..ec7ae74 --- /dev/null +++ b/health/health.d/phpfpm.conf @@ -0,0 +1,14 @@ + +# make sure phpfpm is running + +template: phpfpm_last_collected_secs + on: phpfpm.requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf new file mode 100644 index 0000000..b255d35 --- /dev/null +++ b/health/health.d/pihole.conf @@ -0,0 +1,65 @@ + +# Make sure Pi-hole is responding. + +template: pihole_last_collected_secs + on: pihole.dns_queries_total + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + +# Blocked DNS queries. + +template: pihole_blocked_queries + on: pihole.dns_queries_percentage + every: 10s + units: % + calc: $blocked + warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) + crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) ) + delay: up 2m down 5m + info: percentage of blocked dns queries for the last 24 hour + to: sysadmin + + +# Blocklist last update time. +# Default update interval is a week. + +template: pihole_blocklist_last_update + on: pihole.blocklist_last_update + every: 10s + units: seconds + calc: $ago + warn: $this > 60 * 60 * 24 * 8 + crit: $this > 60 * 60 * 24 * 8 * 2 + info: blocklist last update time + to: sysadmin + +# Gravity file check (gravity.list). + +template: pihole_blocklist_gravity_file + on: pihole.blocklist_last_update + every: 10s + units: boolean + calc: $file_exists + crit: $this != 1 + delay: up 2m down 5m + info: gravity file existence + to: sysadmin + +# Pi-hole's ability to block unwanted domains. +# Should be enabled. The whole point of Pi-hole! + +template: pihole_status + on: pihole.unwanted_domains_blocking_status + every: 10s + units: boolean + calc: $enabled + warn: $this != 1 + delay: up 2m down 5m + info: unwanted domains blocking status + to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf new file mode 100644 index 0000000..696333f --- /dev/null +++ b/health/health.d/portcheck.conf @@ -0,0 +1,46 @@ +template: portcheck_last_collected_secs +families: * + on: portcheck.status + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges +template: service_reachable +families: * + on: portcheck.status + lookup: average -1m unaligned percentage of success + calc: ($this < 75) ? 
(0) : ($this) + every: 5s + units: up/down + info: at least 75% successful connections during last 60 seconds, ideal for badges + to: silent + +template: connection_timeouts +families: * + on: portcheck.status + lookup: average -5m unaligned percentage of timeout + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + info: average of timeouts during the last 5 minutes + to: sysadmin + +template: connection_fails +families: * + on: portcheck.status + lookup: average -5m unaligned percentage of no_connection,failed + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + info: average of failed connections during the last 5 minutes + to: sysadmin diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf new file mode 100644 index 0000000..4e0583b --- /dev/null +++ b/health/health.d/postgres.conf @@ -0,0 +1,13 @@ + +# make sure postgres is running + +template: postgres_last_collected_secs + on: postgres.db_stat_transactions + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf new file mode 100644 index 0000000..293f1aa --- /dev/null +++ b/health/health.d/processes.conf @@ -0,0 +1,13 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: active_processes + on: system.active_processes + hosts: * + calc: $active * 100 / $pidmax + units: % + every: 5s + warn: $this > (($status >= $WARNING) ? (75) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (90)) + delay: down 5m multiplier 1.5 max 1h + info: the percentage of active processes + to: sysadmin diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf new file mode 100644 index 0000000..0147894 --- /dev/null +++ b/health/health.d/pulsar.conf @@ -0,0 +1,13 @@ + +# Availability + +template: pulsar_last_collected_secs + on: pulsar.broker_components + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf new file mode 100644 index 0000000..7290d15 --- /dev/null +++ b/health/health.d/qos.conf @@ -0,0 +1,18 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# check if a QoS class is dropping packets +# the alarm is checked every 10 seconds +# and examines the last minute of data + +#template: 10min_qos_packet_drops +# on: tc.qos_dropped +# os: linux +# hosts: * +# lookup: sum -10m unaligned absolute +# every: 30s +# warn: $this > 0 +# delay: up 0 down 30m multiplier 1.5 max 1h +# units: packets +# info: dropped packets in the last 30 minutes +# to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf new file mode 100644 index 0000000..0a71dac --- /dev/null +++ b/health/health.d/ram.conf @@ -0,0 +1,64 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: used_ram_to_ignore + on: system.ram + os: linux freebsd + hosts: * + calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) + every: 10s + info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) + + alarm: ram_in_use + on: system.ram + os: linux + hosts: * +# calc: $used * 100 / ($used + $cached + $free) + calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: system RAM used + to: sysadmin + + alarm: ram_available + on: mem.available + os: linux + hosts: * + calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) + delay: down 15m multiplier 1.5 max 1h + info: estimated amount of RAM available for userspace processes, without causing swapping + to: sysadmin + +## FreeBSD + alarm: ram_in_use + on: system.ram + os: freebsd + hosts: * + calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: system RAM usage + to: sysadmin + + alarm: ram_available + on: system.ram + os: freebsd + hosts: * + calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + crit: $this < (($status == $CRITICAL) ? 
(10) : ( 5)) + delay: down 15m multiplier 1.5 max 1h + info: estimated amount of RAM available for userspace processes, without causing swapping + to: sysadmin diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf new file mode 100644 index 0000000..c08a884 --- /dev/null +++ b/health/health.d/redis.conf @@ -0,0 +1,34 @@ + +# make sure redis is running + +template: redis_last_collected_secs + on: redis.operations + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + +template: redis_bgsave_broken +families: * + on: redis.bgsave_health + every: 10s + crit: $rdb_last_bgsave_status != 0 + units: ok/failed + info: states if redis bgsave is working + delay: down 5m multiplier 1.5 max 1h + to: dba + +template: redis_bgsave_slow +families: * + on: redis.bgsave_now + every: 10s + warn: $rdb_bgsave_in_progress > 600 + crit: $rdb_bgsave_in_progress > 1200 + units: seconds + info: the time redis needs to save its database + delay: down 5m multiplier 1.5 max 1h + to: dba diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf new file mode 100644 index 0000000..2344b60 --- /dev/null +++ b/health/health.d/retroshare.conf @@ -0,0 +1,25 @@ +# make sure RetroShare is running + +template: retroshare_last_collected_secs + on: retroshare.peers + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure the DHT is fine when active + +template: retroshare_dht_working + on: retroshare.dht + calc: $dht_size_all + units: peers + every: 1m + warn: $this < (($status >= $WARNING) ? (120) : (100)) + crit: $this < (($status == $CRITICAL) ? (10) : (1)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: Checks if the DHT has enough peers to operate + to: sysadmin diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf new file mode 100644 index 0000000..7453027 --- /dev/null +++ b/health/health.d/riakkv.conf @@ -0,0 +1,80 @@ +# Ensure that Riak is running. template: riak_last_collected_secs +template: riak_last_collected_secs + on: riak.kv.throughput + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + +# Warn if a list keys operation is running. 
+template: riak_list_keys_active + on: riak.core.fsm_active + calc: $list_fsm_active + units: state machines + every: 10s + warn: $list_fsm_active > 0 + info: number of currently running list keys finite state machines + to: dba + + +## Timing healthchecks +# KV GET +template: 1h_kv_get_mean_latency + on: riak.kv.latency.get + calc: $node_get_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: mean average KV GET latency over the last hour + +template: riak_kv_get_slow + on: riak.kv.latency.get + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_kv_get_mean_latency * 2) ) + crit: ($this > ($1h_kv_get_mean_latency * 3) ) + info: average KV GET time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + +# KV PUT +template: 1h_kv_put_mean_latency + on: riak.kv.latency.put + calc: $node_put_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: mean average KV PUT latency over the last hour + +template: riak_kv_put_slow + on: riak.kv.latency.put + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_kv_put_mean_latency * 2) ) + crit: ($this > ($1h_kv_put_mean_latency * 3) ) + info: average KV PUT time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + + +## VM healthchecks + +# Default Erlang VM process limit: 262144 +# On systems observed, this is < 2000, but may grow depending on load. +template: riak_vm_high_process_count + on: riak.vm + calc: $sys_process_count + units: processes + every: 10s + warn: $this > 10000 + crit: $this > 100000 + info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144) + to: dba diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf new file mode 100644 index 0000000..1a3088a --- /dev/null +++ b/health/health.d/scaleio.conf @@ -0,0 +1,38 @@ + +# make sure scaleio is running + +template: scaleio_last_collected_secs + on: scaleio.system_capacity_total + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure Storage Pool capacity utilization is under limit + +template: scaleio_storage_pool_capacity_utilization + on: scaleio.storage_pool_capacity_utilization + calc: $used + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: Storage Pool capacity utilization + to: sysadmin + + +# make sure Sdc is connected to MDM + +template: scaleio_sdc_mdm_connection_state + on: scaleio.sdc_mdm_connection_state + calc: $connected + every: 10s + warn: $this != 1 + delay: up 30s down 5m multiplier 1.5 max 1h + info: Sdc connection to MDM state + to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf new file mode 100644 index 0000000..f835f2a --- /dev/null +++ b/health/health.d/softnet.conf @@ -0,0 +1,40 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# check for common /proc/net/softnet_stat errors + + alarm: 1min_netdev_backlog_exceeded + on: system.softnet_stat + os: linux + hosts: * + lookup: average -1m unaligned absolute of dropped + units: packets + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) + to: sysadmin + + alarm: 1min_netdev_budget_ran_outs + on: system.softnet_stat + os: linux + hosts: * + lookup: average -1m unaligned absolute of squeezed + units: events + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets) + to: silent + + alarm: 10min_netisr_backlog_exceeded + on: system.softnet_stat + os: freebsd + hosts: * + lookup: average -1m unaligned absolute of qdrops + units: packets + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets) + to: sysadmin diff --git a/health/health.d/squid.conf b/health/health.d/squid.conf new file mode 100644 index 0000000..06cc967 --- /dev/null +++ b/health/health.d/squid.conf @@ -0,0 +1,14 @@ + +# make sure squid is running + +template: squid_last_collected_secs + on: squid.clients_requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: proxyadmin + diff --git a/health/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf new file mode 100644 index 0000000..e0361eb --- /dev/null +++ b/health/health.d/stiebeleltron.conf @@ -0,0 +1,11 @@ +template: stiebeleltron_last_collected_secs +families: * + on: stiebeleltron.heating.hc1 + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ?
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sitemgr diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf new file mode 100644 index 0000000..f920b08 --- /dev/null +++ b/health/health.d/swap.conf @@ -0,0 +1,43 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 30min_ram_swapped_out + on: system.swapio + os: linux freebsd + hosts: * + lookup: sum -30m unaligned absolute of out + # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 + calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + units: % of RAM + every: 1m + warn: $this > (($status >= $WARNING) ? (10) : (20)) + crit: $this > (($status == $CRITICAL) ? (20) : (30)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM + to: sysadmin + + alarm: ram_in_swap + on: system.swap + os: linux + hosts: * + calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + units: % of RAM + every: 10s + warn: $this > (($status >= $WARNING) ? (15) : (20)) + crit: $this > (($status == $CRITICAL) ? (40) : (50)) + delay: up 30s down 15m multiplier 1.5 max 1h + info: the swap memory used, as a percentage of the system RAM + to: sysadmin + + alarm: used_swap + on: system.swap + os: linux freebsd + hosts: * + calc: $used * 100 / ( $used + $free ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: up 30s down 15m multiplier 1.5 max 1h + info: the percentage of swap memory used + to: sysadmin diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf new file mode 100644 index 0000000..7aa9a98 --- /dev/null +++ b/health/health.d/tcp_conn.conf @@ -0,0 +1,19 @@ + +# +# ${tcp_max_connections} may be nan or -1 if the system +# supports dynamic threshold for TCP connections. +# In this case, the alarm will always be zero. +# + + alarm: tcp_connections + on: ipv4.tcpsock + os: linux + hosts: * + calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) + crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 )) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the percentage of IPv4 TCP connections over the max allowed + to: sysadmin diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf new file mode 100644 index 0000000..3b30725 --- /dev/null +++ b/health/health.d/tcp_listen.conf @@ -0,0 +1,83 @@ +# +# There are two queues involved when incoming TCP connections are handled +# (both at the kernel): +# +# SYN queue +# The SYN queue tracks TCP handshakes until connections are fully established. +# It overflows when too many incoming TCP connection requests hang in the +# half-open state and the server is not configured to fall back to SYN cookies. +# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends +# lots of SYN packets and never completes the handshakes). +# +# Accept queue +# The accept queue holds fully established TCP connections waiting to be handled +# by the listening application. It overflows when the server application fails +# to accept new connections at the rate they are coming in. 
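A minimal sketch (illustrative only, not part of any shipped configuration) of reading the raw kernel counters that the accept-queue and SYN-queue alarms below are built on; it assumes the usual Linux /proc/net/netstat layout, where a "TcpExt:" header line of counter names is followed by a "TcpExt:" line of values, and that the ListenOverflows, ListenDrops, TCPReqQFullDrop and TCPReqQFullDoCookies counters are present:

# illustrative sketch: dump the cumulative TcpExt counters behind the alarms below
# (netdata collects these itself; this only shows where the numbers come from)
def tcpext_counters(path="/proc/net/netstat"):
    counters = {}
    with open(path) as f:
        lines = f.readlines()
    # /proc/net/netstat comes in pairs: "<Group>: name name ..." then "<Group>: value value ..."
    for header, values in zip(lines[0::2], lines[1::2]):
        if header.startswith("TcpExt:"):
            names = header.split()[1:]
            counters.update(zip(names, map(int, values.split()[1:])))
    return counters

c = tcpext_counters()
for name in ("ListenOverflows", "ListenDrops", "TCPReqQFullDrop", "TCPReqQFullDoCookies"):
    print(name, c.get(name, "n/a"))

These are cumulative counters; the templates below evaluate the per-second values collected from them, averaged over the last 60 seconds.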
+# +# +# ----------------------------------------------------------------------------- +# tcp accept queue (at the kernel) + + alarm: 1m_tcp_accept_queue_overflows + on: ip.tcp_accept_queue + os: linux + hosts: * + lookup: average -60s unaligned absolute of ListenOverflows + units: overflows + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (1) : (5)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the average number of times the TCP accept queue of the kernel overflown, during the last minute + to: sysadmin + +# THIS IS TOO GENERIC +# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 + alarm: 1m_tcp_accept_queue_drops + on: ip.tcp_accept_queue + os: linux + hosts: * + lookup: average -60s unaligned absolute of ListenDrops + units: drops + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (1) : (5)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received) + to: sysadmin + + +# ----------------------------------------------------------------------------- +# tcp SYN queue (at the kernel) + +# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or +# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are +# enabled or not. In both cases this probably indicates a SYN flood attack, +# so i guess a notification should be sent. + + alarm: 1m_tcp_syn_queue_drops + on: ip.tcp_syn_queue + os: linux + hosts: * + lookup: average -60s unaligned absolute of TCPReqQFullDrop + units: drops + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h + info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute + to: sysadmin + + alarm: 1m_tcp_syn_queue_cookies + on: ip.tcp_syn_queue + os: linux + hosts: * + lookup: average -60s unaligned absolute of TCPReqQFullDoCookies + units: cookies + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h + info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute + to: sysadmin + diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf new file mode 100644 index 0000000..6927d57 --- /dev/null +++ b/health/health.d/tcp_mem.conf @@ -0,0 +1,20 @@ +# +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# We give a warning when TCP is under memory pressure +# and a critical when TCP is 90% of its upper memory limit +# + + alarm: tcp_memory + on: ipv4.sockstat_tcp_mem + os: linux + hosts: * + calc: ${mem} * 100 / ${tcp_mem_high} + units: % + every: 10s + warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) + crit: ${mem} > (($status >= $CRITICAL ) ? 
( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the amount of TCP memory as a percentage of its max memory limit + to: sysadmin diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf new file mode 100644 index 0000000..280d659 --- /dev/null +++ b/health/health.d/tcp_orphans.conf @@ -0,0 +1,21 @@ + +# +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# The kernel may penalize orphans by 2x or even 4x +# so we alarm warning at 25% and critical at 50% +# + + alarm: tcp_orphans + on: ipv4.sockstat_tcp_sockets + os: linux + hosts: * + calc: ${orphan} * 100 / ${tcp_max_orphans} + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) + crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 )) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors) + to: sysadmin diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf new file mode 100644 index 0000000..36a550a --- /dev/null +++ b/health/health.d/tcp_resets.conf @@ -0,0 +1,67 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- + + alarm: ipv4_tcphandshake_last_collected_secs + on: ipv4.tcphandshake + os: linux freebsd + hosts: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# ----------------------------------------------------------------------------- +# tcp resets this host sends + + alarm: 1m_ipv4_tcp_resets_sent + on: ipv4.tcphandshake + os: linux + hosts: * + lookup: average -1m at -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + info: average TCP RESETS this host is sending, over the last minute + + alarm: 10s_ipv4_tcp_resets_sent + on: ipv4.tcphandshake + os: linux + hosts: * + lookup: average -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20))) + delay: up 20s down 60m multiplier 1.2 max 2h + options: no-clear-notification + info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is being made, or that a service running on this host has crashed; clear notification for this alarm will not be sent) + to: sysadmin + +# ----------------------------------------------------------------------------- +# tcp resets this host receives + + alarm: 1m_ipv4_tcp_resets_received + on: ipv4.tcphandshake + os: linux freebsd + hosts: * + lookup: average -1m at -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + info: average TCP RESETS this host is receiving, over the last minute + + alarm: 10s_ipv4_tcp_resets_received + on: ipv4.tcphandshake + os: linux freebsd + hosts: * + lookup: average -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ?
(1) : (10))) + delay: up 20s down 60m multiplier 1.2 max 2h + options: no-clear-notification + info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent) + to: sysadmin diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf new file mode 100644 index 0000000..1e47b5c --- /dev/null +++ b/health/health.d/udp_errors.conf @@ -0,0 +1,49 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- + + alarm: ipv4_udperrors_last_collected_secs + on: ipv4.udperrors + os: linux freebsd + hosts: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# ----------------------------------------------------------------------------- +# UDP receive buffer errors + + alarm: 1m_ipv4_udp_receive_buffer_errors + on: ipv4.udperrors + os: linux freebsd + hosts: * + lookup: average -1m unaligned absolute of RcvbufErrors + units: errors + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (10)) + info: average number of UDP receive buffer errors during the last minute + delay: up 0 down 60m multiplier 1.2 max 2h + to: sysadmin + +# ----------------------------------------------------------------------------- +# UDP send buffer errors + + alarm: 1m_ipv4_udp_send_buffer_errors + on: ipv4.udperrors + os: linux + hosts: * + lookup: average -1m unaligned absolute of SndbufErrors + units: errors + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (10)) + info: number of UDP send buffer errors during the last minute + delay: up 0 down 60m multiplier 1.2 max 2h + to: sysadmin diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf new file mode 100644 index 0000000..bdedc11 --- /dev/null +++ b/health/health.d/unbound.conf @@ -0,0 +1,35 @@ + +# make sure unbound is running + +template: unbound_last_collected_secs + on: unbound.queries + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure there is no overwritten/dropped queries in the request-list + +template: unbound_request_list_overwritten + on: unbound.request_list_jostle_list + lookup: average -60s unaligned absolute match-names of overwritten + units: queries + every: 10s + warn: $this > 5 + delay: up 10 down 5m multiplier 1.5 max 1h + info: the number of overwritten queries in the request-list + to: sysadmin + +template: unbound_request_list_dropped + on: unbound.request_list_jostle_list + lookup: average -60s unaligned absolute match-names of dropped + units: queries + every: 10s + warn: $this > 0 + delay: up 10 down 5m multiplier 1.5 max 1h + info: the number of dropped queries in the request-list + to: sysadmin diff --git a/health/health.d/varnish.conf b/health/health.d/varnish.conf new file mode 100644 index 0000000..cca7446 --- /dev/null +++ b/health/health.d/varnish.conf @@ -0,0 +1,9 @@ + alarm: varnish_last_collected + on: varnish.uptime + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf new file mode 100644 index 0000000..7bb98a9 --- /dev/null +++ b/health/health.d/vcsa.conf @@ -0,0 +1,122 @@ + +# make sure vcsa is running and responding + +template: vcsa_last_collected_secs + on: vcsa.system_health + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# Overall system health: +# - 0: all components are healthy. +# - 1: one or more components might become overloaded soon. +# - 2: one or more components in the appliance might be degraded. +# - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon. +# - 4: no health data is available. + +template: vcsa_system_health + on: vcsa.system_health + lookup: max -10s unaligned of system + units: status + every: 10s + warn: ($this == 1) || ($this == 2) + crit: $this == 3 + delay: down 1m multiplier 1.5 max 1h + info: overall system health status + to: sysadmin + +# Components health: +# - 0: healthy. +# - 1: healthy, but may have some problems. +# - 2: degraded, and may have serious problems. +# - 3: unavailable, or will stop functioning soon. +# - 4: no health data is available. 
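As a quick orientation (illustrative only, not part of the configuration), every component template that follows applies the same mapping from these status codes to alarm levels; a small sketch of that mapping, assuming the legend above:

# illustrative mapping of the VCSA component status codes (legend above) to the
# alarm level raised by the vcsa_*_health templates below
def vcsa_component_alarm(code):
    if code == 1:
        return "WARNING"    # warn: $this == 1
    if code in (2, 3):
        return "CRITICAL"   # crit: ($this == 2) || ($this == 3)
    return "CLEAR"          # 0 (healthy) and 4 (no data) raise nothing

assert vcsa_component_alarm(2) == "CRITICAL"
assert vcsa_component_alarm(4) == "CLEAR"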
+ +template: vcsa_swap_health + on: vcsa.components_health + lookup: max -10s unaligned of swap + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: swap health status + to: sysadmin + +template: vcsa_storage_health + on: vcsa.components_health + lookup: max -10s unaligned of storage + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: storage health status + to: sysadmin + +template: vcsa_mem_health + on: vcsa.components_health + lookup: max -10s unaligned of mem + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: mem health status + to: sysadmin + +template: vcsa_load_health + on: vcsa.components_health + lookup: max -10s unaligned of load + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: load health status + to: sysadmin + +template: vcsa_database_storage_health + on: vcsa.components_health + lookup: max -10s unaligned of database_storage + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: database storage health status + to: sysadmin + +template: vcsa_applmgmt_health + on: vcsa.components_health + lookup: max -10s unaligned of applmgmt + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: appl mgmt health status + to: sysadmin + + +# Software updates health: +# - 0: no updates available. +# - 2: non-security updates are available. +# - 3: security updates are available. +# - 4: an error retrieving information on software updates. + +template: vcsa_software_updates_health + on: vcsa.software_updates_health + lookup: max -10s unaligned of software_packages + units: status + every: 10s + warn: $this == 4 + crit: $this == 3 + delay: down 1m multiplier 1.5 max 1h + info: software packages health status + to: sysadmin diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf new file mode 100644 index 0000000..36bbaf8 --- /dev/null +++ b/health/health.d/vernemq.conf @@ -0,0 +1,399 @@ + +# Availability + +template: vernemq_last_collected_secs + on: vernemq.node_uptime + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# Socket errors + +template: vernemq_socket_errors + on: vernemq.socket_errors + lookup: sum -1m unaligned absolute of socket_error + units: errors + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: socket errors in the last minute + to: sysadmin + +# Queues dropped/expired/unhandled PUBLISH messages + +template: vernemq_queue_message_drop + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_drop + units: dropped messages + every: 10s + warn: $this > (($status == $WARNING) ? 
(0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: dropped messaged due to full queues in the last minute + to: sysadmin + +template: vernemq_queue_message_expired + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_expired + units: expired messages + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (15)) + delay: down 5m multiplier 1.5 max 2h + info: messages which expired before delivery in the last minute + to: sysadmin + +template: vernemq_queue_message_unhandled + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_unhandled + units: unhandled messages + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: unhandled messages (connections with clean session=true) in the last minute + to: sysadmin + +# Erlang VM + +template: vernemq_average_scheduler_utilization + on: vernemq.average_scheduler_utilization + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: average scheduler utilization for the last 10 minutes + to: sysadmin + +# Cluster communication and netsplits + +template: vernemq_cluster_dropped + on: vernemq.cluster_dropped + lookup: average -1m unaligned + units: KiB/s + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: the amount of traffic dropped during communication with the cluster nodes in the last minute + to: sysadmin + +template: vernemq_netsplits + on: vernemq.netsplits + lookup: sum -1m unaligned absolute of netsplit_detected + units: netsplits + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: detected netsplits in the last minute + to: sysadmin + +# Unsuccessful CONNACK + +template: vernemq_mqtt_connack_sent_reason_success + on: vernemq.mqtt_connack_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v3/v5 CONNACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_connack_sent_reason_unsuccessful + on: vernemq.mqtt_connack_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_connack_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v3/v5 CONNACK sent in the last minute + to: sysadmin + +# Not normal DISCONNECT + +template: vernemq_mqtt_disconnect_received_reason_normal_disconnect + on: vernemq.mqtt_disconnect_received_reason + lookup: sum -1m unaligned absolute match-names of normal_disconnect + units: packets + every: 10s + info: normal v5 DISCONNECT received in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect + on: vernemq.mqtt_disconnect_sent_reason + lookup: sum -1m unaligned absolute match-names of normal_disconnect + units: packets + every: 10s + info: normal v5 DISCONNECT sent in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_received_reason_not_normal + on: vernemq.mqtt_disconnect_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: not normal v5 DISCONNECT received in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_sent_reason_not_normal + on: 
vernemq.mqtt_disconnect_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: not normal v5 DISCONNECT sent in the last minute + to: sysadmin + +# SUBSCRIBE errors and unauthorized attempts + +template: vernemq_mqtt_subscribe_error + on: vernemq.mqtt_subscribe_error + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 SUBSCRIBE operations in the last minute + to: sysadmin + +template: vernemq_mqtt_subscribe_auth_error + on: vernemq.mqtt_subscribe_auth_error + lookup: sum -1m unaligned absolute + units: attempts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute + to: sysadmin + +# UNSUBSCRIBE errors + +template: vernemq_mqtt_unsubscribe_error + on: vernemq.mqtt_unsubscribe_error + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 UNSUBSCRIBE operations in the last minute + to: sysadmin + +# PUBLISH errors and unauthorized attempts + +template: vernemq_mqtt_publish_errors + on: vernemq.mqtt_publish_errors + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 PUBLISH operations in the last minute + to: sysadmin + +template: vernemq_mqtt_publish_auth_errors + on: vernemq.mqtt_publish_auth_errors + lookup: sum -1m unaligned absolute + units: attempts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unauthorized v3/v5 PUBLISH attempts in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBACK + +template: vernemq_mqtt_puback_received_reason_success + on: vernemq.mqtt_puback_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBACK received in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_sent_reason_success + on: vernemq.mqtt_puback_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_received_reason_unsuccessful + on: vernemq.mqtt_puback_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_puback_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBACK received in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_sent_reason_unsuccessful + on: vernemq.mqtt_puback_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_puback_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_unexpected + on: vernemq.mqtt_puback_invalid_error + lookup: sum -1m unaligned absolute + units: messages + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unexpected v3/v5 PUBACK received in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBREC + +template: vernemq_mqtt_pubrec_received_reason_success + on: vernemq.mqtt_pubrec_received_reason + lookup: sum -1m unaligned 
absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREC received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_sent_reason_success + on: vernemq.mqtt_pubrec_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREC sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_received_reason_unsuccessful + on: vernemq.mqtt_pubrec_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrec_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREC received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_sent_reason_unsuccessful + on: vernemq.mqtt_pubrec_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrec_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREC sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_invalid_error + on: vernemq.mqtt_pubrec_invalid_error + lookup: sum -1m unaligned absolute + units: messages + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unexpected v3 PUBREC received in the last minute + to: sysadmin + +# Unsuccessful PUBREL + +template: vernemq_mqtt_pubrel_received_reason_success + on: vernemq.mqtt_pubrel_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREL received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_sent_reason_success + on: vernemq.mqtt_pubrel_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREL sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_received_reason_unsuccessful + on: vernemq.mqtt_pubrel_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrel_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREL received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_sent_reason_unsuccessful + on: vernemq.mqtt_pubrel_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrel_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREL sent in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBCOMP + +template: vernemq_mqtt_pubcomp_received_reason_success + on: vernemq.mqtt_pubcomp_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBCOMP received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_sent_reason_success + on: vernemq.mqtt_pubcomp_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBCOMP sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_received_reason_unsuccessful + on: vernemq.mqtt_pubcomp_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubcomp_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBCOMP received in the last minute + to: sysadmin + 
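All of the *_reason_unsuccessful templates in this file, including the PUBCOMP pair around this point, use the same derivation: sum every reason dimension of the chart over the last minute, then subtract the value of the companion *_reason_success template. A tiny worked sketch of that arithmetic (the dimension names and counts are invented for illustration):

# invented sample: per-reason packet counts observed over the last minute
reasons_last_minute = {"success": 120, "reason_a": 2, "reason_b": 1}

total = sum(reasons_last_minute.values())      # lookup: sum -1m unaligned absolute (all dimensions)
success = reasons_last_minute["success"]       # the companion *_reason_success template
unsuccessful = total - success                 # calc: $this - $..._reason_success

assert unsuccessful == 3                       # warn: $this > 0 would fire with this sample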
+
+template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
+ on: vernemq.mqtt_pubcomp_sent_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBCOMP sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubcomp_unexpected
+ on: vernemq.mqtt_pubcomp_invalid_error
+ lookup: sum -1m unaligned absolute
+ units: messages
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unexpected v3/v5 PUBCOMP received in the last minute
+ to: sysadmin
diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf
new file mode 100644
index 0000000..d8b2be1
--- /dev/null
+++ b/health/health.d/vsphere.conf
@@ -0,0 +1,157 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------VM Specific------------------------------------------------------------
+# Memory
+
+template: vsphere_vm_mem_usage
+ on: vsphere.vm_mem_usage_percentage
+ hosts: *
+ calc: $used
+ units: %
+ every: 20s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used RAM
+
+# -----------------------------------------------HOST Specific----------------------------------------------------------
+# Memory
+
+template: vsphere_host_mem_usage
+ on: vsphere.host_mem_usage_percentage
+ hosts: *
+ calc: $used
+ units: %
+ every: 20s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used RAM
+
+# Network errors
+
+template: vsphere_inbound_packets_errors
+ on: vsphere.net_errors_total
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of rx
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound errors in the last 10 minutes
+ to: sysadmin
+
+template: vsphere_outbound_packets_errors
+ on: vsphere.net_errors_total
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of tx
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound errors in the last 10 minutes
+ to: sysadmin
+
+# Network errors ratio
+
+template: vsphere_inbound_packets_errors_ratio
+ on: vsphere.net_packets_total
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of rx
+ calc: (($vsphere_inbound_packets_errors != nan AND $this > 0) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 0.1
+ crit: $this >= 2
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of inbound errors vs the total number of received packets of the network interface, during the last 10 minutes
+ to: sysadmin
+
+template: vsphere_outbound_packets_errors_ratio
+ on: vsphere.net_packets_total
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of tx
+ calc: (($vsphere_outbound_packets_errors != nan AND $this > 0) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 0.1
+ crit: $this >= 2
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of outbound errors vs the total number of sent packets of the network interface, during the last 10 minutes
+ to: sysadmin
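The ratio templates divide the 10-minute error sum computed by the previous alarm by this alarm's own 10-minute packet sum, guarding both against an error alarm that has not produced a value yet (`nan`) and against a zero denominator. A small Python sketch of that guard (illustrative only, with `math.nan` standing in for a not-yet-computed alarm):

    import math

    def error_ratio(errors: float, packets: float) -> float:
        """Mirrors: ((errors != nan AND packets > 0) ? (errors * 100 / packets) : (0))"""
        if math.isnan(errors) or packets <= 0:
            return 0.0
        return errors * 100.0 / packets

    print(error_ratio(math.nan, 50_000))  # 0.0 -- error alarm not computed yet
    print(error_ratio(25, 50_000))        # 0.05 -> below the 0.1% warning threshold
    print(error_ratio(75, 50_000))        # 0.15 -> warn (>= 0.1), not yet crit (>= 2)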
+
+# -----------------------------------------------Common-------------------------------------------------------------------
+# CPU
+
+template: vsphere_cpu_usage
+ on: vsphere.cpu_usage_total
+ hosts: *
+ lookup: average -10m unaligned match-names of used
+ units: %
+ every: 20s
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: cpu utilization for the last 10 minutes
+ to: sysadmin
+
+# Network drops
+
+template: vsphere_inbound_packets_dropped
+ on: vsphere.net_drops_total
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of rx
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound dropped packets in the last 10 minutes
+ to: sysadmin
+
+template: vsphere_outbound_packets_dropped
+ on: vsphere.net_drops_total
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of tx
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound dropped packets in the last 10 minutes
+ to: sysadmin
+
+# Network drops ratio
+
+template: vsphere_inbound_packets_dropped_ratio
+ on: vsphere.net_packets_total
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of rx
+ calc: (($vsphere_inbound_packets_dropped != nan AND $this > 0) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 0.1
+ crit: $this >= 2
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+ to: sysadmin
+
+template: vsphere_outbound_packets_dropped_ratio
+ on: vsphere.net_packets_total
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of tx
+ calc: (($vsphere_outbound_packets_dropped != nan AND $this > 0) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 0.1
+ crit: $this >= 2
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
+ to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
new file mode 100644
index 0000000..44de38a
--- /dev/null
+++ b/health/health.d/web_log.conf
@@ -0,0 +1,385 @@
+
+# make sure we can collect web log data
+
+template: last_collected_secs
+ on: web_log.response_codes
+families: *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
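The `*_last_collected_secs` templates used here (and throughout these files) compare the time since the last successful collection against multiples of the collector's update interval, tightening the limit once the alarm is already raised. A rough Python model of that check, assuming a 1-second update interval (illustrative, not Netdata's evaluator):

    UPDATE_EVERY = 1  # seconds; assumed collector interval

    def staleness_status(now: float, last_collected_t: float, status: str) -> str:
        age = now - last_collected_t                     # calc: $now - $last_collected_t
        warn_limit = UPDATE_EVERY if status in ("WARNING", "CRITICAL") else 5 * UPDATE_EVERY
        crit_limit = UPDATE_EVERY if status == "CRITICAL" else 60 * UPDATE_EVERY
        if age > crit_limit:
            return "CRITICAL"
        if age > warn_limit:
            return "WARNING"
        return "CLEAR"

    print(staleness_status(now=100, last_collected_t=98, status="CLEAR"))    # CLEAR (2s <= 5s)
    print(staleness_status(now=100, last_collected_t=92, status="CLEAR"))    # WARNING (8s > 5s)
    print(staleness_status(now=100, last_collected_t=92, status="WARNING"))  # stays WARNING until age <= 1s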
+
+
+# -----------------------------------------------------------------------------
+# high level response code alarms
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: 1m_requests
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: the sum of all HTTP requests over the last minute
+
+template: 1m_successful
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned of successful_requests
+ calc: $this * 100 / $1m_requests
+ units: %
+ every: 10s
+ warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
+ to: webmaster
+
+template: 1m_redirects
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned of redirects
+ calc: $this * 100 / $1m_requests
+ units: %
+ every: 10s
+ warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP redirects (3xx except 304) over the last minute
+ to: webmaster
+
+template: 1m_bad_requests
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned of bad_requests
+ calc: $this * 100 / $1m_requests
+ units: %
+ every: 10s
+ warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP bad requests (4xx except 401) over the last minute
+ to: webmaster
+
+template: 1m_internal_errors
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned of server_errors
+ calc: $this * 100 / $1m_requests
+ units: %
+ every: 10s
+ warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP internal server errors (5xx), over the last minute
+ to: webmaster
+
+# unmatched lines
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $1m_total_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: 1m_total_requests
+ on: web_log.response_codes
+families: *
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: the sum of all HTTP requests over the last minute
+
+template: 1m_unmatched
+ on: web_log.response_codes
+families: *
+ lookup: sum -1m unaligned of unmatched
+ calc: $this * 100 / $1m_total_requests
+ units: %
+ every: 10s
+ warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ info: the ratio of unmatched lines, over the last minute
+ to: webmaster
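Every percentage alarm above is wrapped in the `($1m_requests > 120) ? (...) : (0)` guard that the comment describes: with fewer than roughly two requests per second, the whole expression evaluates to 0 (false) and nothing is raised no matter how bad the ratio looks. A worked sketch of the `1m_successful` warn line, assuming the convention that a zero/false warn expression means "do not raise":

    def successful_warn(requests_1m: int, successful_pct: float, status: str) -> bool:
        """Models: ($1m_requests > 120) ? ($this < (($status >= $WARNING) ? 95 : 85)) : (0)"""
        if requests_1m <= 120:          # not enough traffic to judge the ratio
            return False
        threshold = 95 if status in ("WARNING", "CRITICAL") else 85
        return successful_pct < threshold

    print(successful_warn(40, 50.0, "CLEAR"))     # False: only 40 requests, ratio ignored
    print(successful_warn(600, 80.0, "CLEAR"))    # True: enough traffic and below 85%
    print(successful_warn(600, 90.0, "WARNING"))  # True: must climb back above 95% to clear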
+
+# -----------------------------------------------------------------------------
+# web slow
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: 10m_response_time
+ on: web_log.response_time
+families: *
+ lookup: average -10m unaligned of avg
+ units: ms
+ every: 30s
+ info: the average time to respond to HTTP requests, over the last 10 minutes
+
+template: web_slow
+ on: web_log.response_time
+families: *
+ lookup: average -1m unaligned of avg
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
+ delay: down 15m multiplier 1.5 max 1h
+ info: the average time to respond to HTTP requests, over the last 1 minute
+ options: no-clear-notification
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# web too many or too few requests
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $5m_successful_old > 120
+#
+# i.e. when there were at least 120 requests during the 5 minutes starting
+# at -10m and ending at -5m
+
+template: 5m_successful_old
+ on: web_log.response_statuses
+families: *
+ lookup: average -5m at -5m unaligned of successful_requests
+ units: requests/s
+ every: 30s
+ info: average rate of successful HTTP requests over the last 5 minutes
+
+template: 5m_successful
+ on: web_log.response_statuses
+families: *
+ lookup: average -5m unaligned of successful_requests
+ units: requests/s
+ every: 30s
+ info: average successful HTTP requests over the last 5 minutes
+
+template: 5m_requests_ratio
+ on: web_log.response_codes
+families: *
+ calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
+ units: %
+ every: 30s
+ warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+ crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+ delay: down 15m multiplier 1.5 max 1h
+ options: no-clear-notification
+ info: the percentage of successful web requests over the last 5 minutes, \
+ compared with the previous 5 minutes \
+ (clear notification for this alarm will not be sent)
+ to: webmaster
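`web_slow` only fires when the 1-minute average response time both crosses an absolute threshold (`green`/`red`) and is a multiple of its own 10-minute baseline, again gated on enough traffic. A sketch of that combined condition with illustrative values (not Netdata's evaluator):

    GREEN_MS, RED_MS = 500, 1000

    def web_slow_level(requests_1m: int, avg_1m_ms: float, avg_10m_ms: float) -> str:
        if requests_1m <= 120:
            return "CLEAR"                       # low traffic: averages are too noisy
        if avg_1m_ms > RED_MS and avg_1m_ms > 4 * avg_10m_ms:
            return "CRITICAL"
        if avg_1m_ms > GREEN_MS and avg_1m_ms > 2 * avg_10m_ms:
            return "WARNING"
        return "CLEAR"

    print(web_slow_level(600, 550, 400))   # CLEAR: above green but not 2x the baseline
    print(web_slow_level(600, 900, 300))   # WARNING: > 500 ms and 3x the 10m average
    print(web_slow_level(600, 1500, 300))  # CRITICAL: > 1000 ms and 5x the 10m average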
+
+
+
+# ---------------------------------------------------GO-VERSION---------------------------------------------------------
+
+# make sure we can collect web log data
+
+template: web_log_last_collected_secs
+ on: web_log.requests
+families: *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+# unmatched lines
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $1m_total_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_1m_total_requests
+ on: web_log.requests
+families: *
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: the sum of all HTTP requests over the last minute
+
+template: web_log_1m_unmatched
+ on: web_log.excluded_requests
+families: *
+ lookup: sum -1m unaligned of unmatched
+ calc: $this * 100 / $web_log_1m_total_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ info: the ratio of unmatched lines, over the last minute
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# high level response code alarms
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_1m_requests
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: the sum of all HTTP requests over the last minute
+
+template: web_log_1m_successful
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned of success
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
+ to: webmaster
+
+template: web_log_1m_redirects
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned of redirect
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP redirects (3xx except 304) over the last minute
+ to: webmaster
+
+template: web_log_1m_bad_requests
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned of bad
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP bad requests (4xx except 401) over the last minute
+ to: webmaster
+
+template: web_log_1m_internal_errors
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned of error
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP internal server errors (5xx), over the last minute
+ to: webmaster
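Both `*_1m_total_requests` helpers clamp an all-zero minute to 1 with `calc: ($this == 0)?(1):($this)`, so the percentage templates that divide by them can never divide by zero; the `> 120` traffic guard then keeps such quiet minutes from alarming anyway. A tiny sketch of the clamp (illustrative values):

    def denominator(requests_last_minute: int) -> int:
        """Models calc: ($this == 0)?(1):($this) -- never hand a zero to the ratio alarms."""
        return 1 if requests_last_minute == 0 else requests_last_minute

    def error_percentage(errors: int, requests_last_minute: int) -> float:
        return errors * 100.0 / denominator(requests_last_minute)

    print(error_percentage(0, 0))     # 0.0 instead of a ZeroDivisionError
    print(error_percentage(12, 400))  # 3.0 -> would warn only if the minute had > 120 requests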
+
+# -----------------------------------------------------------------------------
+# web slow
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_10m_response_time
+ on: web_log.request_processing_time
+families: *
+ lookup: average -10m unaligned of avg
+ units: ms
+ every: 30s
+ info: the average time to respond to HTTP requests, over the last 10 minutes
+
+template: web_log_web_slow
+ on: web_log.request_processing_time
+families: *
+ lookup: average -1m unaligned of avg
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
+ delay: down 15m multiplier 1.5 max 1h
+ info: the average time to respond to HTTP requests, over the last 1 minute
+ options: no-clear-notification
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# web too many or too few requests
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $5m_successful_old > 120
+#
+# i.e. when there were at least 120 requests during the 5 minutes starting
+# at -10m and ending at -5m
+
+template: web_log_5m_successful_old
+ on: web_log.type_requests
+families: *
+ lookup: average -5m at -5m unaligned of success
+ units: requests/s
+ every: 30s
+ info: average rate of successful HTTP requests over the last 5 minutes
+
+template: web_log_5m_successful
+ on: web_log.type_requests
+families: *
+ lookup: average -5m unaligned of success
+ units: requests/s
+ every: 30s
+ info: average successful HTTP requests over the last 5 minutes
+
+template: web_log_5m_requests_ratio
+ on: web_log.type_requests
+families: *
+ calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
+ units: %
+ every: 30s
+ warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+ crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+ delay: down 15m multiplier 1.5 max 1h
+ options: no-clear-notification
+ info: the percentage of successful web requests over the last 5 minutes, \
+ compared with the previous 5 minutes \
+ (clear notification for this alarm will not be sent)
+ to: webmaster
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
new file mode 100644
index 0000000..275e11d
--- /dev/null
+++ b/health/health.d/whoisquery.conf
@@ -0,0 +1,24 @@
+
+# make sure whoisquery is running
+
+template: whoisquery_last_collected_secs
+ on: whoisquery.time_until_expiration
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 60s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+
+template: whoisquery_days_until_expiration
+ on: whoisquery.time_until_expiration
+ calc: $expiry
+ units: seconds
+ every: 60s
+ warn: $this < $days_until_expiration_warning*24*60*60
+ crit: $this < $days_until_expiration_critical*24*60*60
+ info: domain time until expiration
+ to: webmaster
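`whoisquery_days_until_expiration` actually works in seconds: the collected value is seconds until expiry, and the warning/critical limits are supplied in days and converted inline with `*24*60*60`. A sketch with hypothetical limits (the real ones come from the `$days_until_expiration_warning`/`_critical` variables defined elsewhere in the configuration):

    DAY = 24 * 60 * 60

    def expiry_status(seconds_until_expiration: int,
                      warn_days: int = 30, crit_days: int = 7) -> str:
        # warn: $this < $days_until_expiration_warning*24*60*60 (day values here are assumptions)
        if seconds_until_expiration < crit_days * DAY:
            return "CRITICAL"
        if seconds_until_expiration < warn_days * DAY:
            return "WARNING"
        return "CLEAR"

    print(expiry_status(90 * DAY))  # CLEAR
    print(expiry_status(20 * DAY))  # WARNING: under 30 days left
    print(expiry_status(3 * DAY))   # CRITICAL: under 7 days left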
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
new file mode 100644
index 0000000..0441fc1
--- /dev/null
+++ b/health/health.d/wmi.conf
@@ -0,0 +1,130 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+## Availability
+
+template: wmi_last_collected_secs
+ on: cpu.collector_duration
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+## CPU
+
+template: wmi_10min_cpu_usage
+ on: wmi.cpu_utilization_total
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: cpu utilization for the last 10 minutes
+ to: sysadmin
+
+
+## Memory
+
+template: wmi_ram_in_use
+ on: wmi.memory_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used RAM
+ to: sysadmin
+
+template: wmi_swap_in_use
+ on: wmi.memory_swap_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used Swap
+ to: sysadmin
+
+
+## Network
+
+template: inbound_packets_discarded
+ on: wmi.net_discarded
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound discarded packets in the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_discarded
+ on: wmi.net_discarded
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound discarded packets in the last 10 minutes
+ to: sysadmin
+
+template: inbound_packets_errors
+ on: wmi.net_errors
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound errors in the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_errors
+ on: wmi.net_errors
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound errors in the last 10 minutes
+ to: sysadmin
+
+
+## Disk
+
+template: wmi_disk_in_use
+ on: wmi.logical_disk_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used disk space
+ to: sysadmin
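The WMI memory, swap and disk alarms all turn a pair of absolute dimensions into a utilisation percentage with `$used * 100 / ($used + $available)` (or `$free` for the disk chart) before applying the status-aware 80/90 and 90/98 thresholds. A sketch of that conversion with illustrative numbers:

    def used_percent(used: float, available: float) -> float:
        """calc: ($used) * 100 / ($used + $available)"""
        return used * 100.0 / (used + available)

    def level(pct: float, status: str) -> str:
        warn = 80 if status in ("WARNING", "CRITICAL") else 90
        crit = 90 if status == "CRITICAL" else 98
        if pct > crit:
            return "CRITICAL"
        if pct > warn:
            return "WARNING"
        return "CLEAR"

    pct = used_percent(used=14.5, available=1.5)  # GiB, illustrative
    print(round(pct, 1), level(pct, "CLEAR"))     # 90.6 WARNING (above 90, not above 98)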
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
new file mode 100644
index 0000000..dfca377
--- /dev/null
+++ b/health/health.d/x509check.conf
@@ -0,0 +1,32 @@
+
+# make sure x509check is running
+
+template: x509check_last_collected_secs
+ on: x509check.time_until_expiration
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 60s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+
+template: x509check_days_until_expiration
+ on: x509check.time_until_expiration
+ calc: $expiry
+ units: seconds
+ every: 60s
+ warn: $this < $days_until_expiration_warning*24*60*60
+ crit: $this < $days_until_expiration_critical*24*60*60
+ info: certificate time until expiration
+ to: webmaster
+
+template: x509check_revocation_status
+ on: x509check.revocation_status
+ calc: $revoked
+ every: 60s
+ crit: $this != nan AND $this != 0
+ info: certificate revocation status
+ to: webmaster
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
new file mode 100644
index 0000000..af73824
--- /dev/null
+++ b/health/health.d/zfs.conf
@@ -0,0 +1,10 @@
+
+ alarm: zfs_memory_throttle
+ on: zfs.memory_ops
+ lookup: sum -10m unaligned absolute of throttled
+ units: events
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
+ to: sysadmin
diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf
new file mode 100644
index 0000000..ffbe31b
--- /dev/null
+++ b/health/health.d/zookeeper.conf
@@ -0,0 +1,14 @@
+
+# make sure zookeeper is running
+
+template: zookeeper_last_collected_secs
+ on: zookeeper.requests
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
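The `x509check_revocation_status` alarm goes straight to critical on any non-zero revocation flag, but only once the chart has actually produced a value: the `$this != nan` clause keeps a certificate that has not been checked yet from alarming. A final sketch of that pattern (illustrative only):

    import math

    def revocation_status(revoked: float) -> str:
        """crit: $this != nan AND $this != 0"""
        if math.isnan(revoked):
            return "CLEAR"        # no data collected yet -- do not page anyone
        return "CRITICAL" if revoked != 0 else "CLEAR"

    print(revocation_status(math.nan))  # CLEAR
    print(revocation_status(0))         # CLEAR
    print(revocation_status(1))         # CRITICAL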