summary refs log tree commit diff stats
path: root/health/health.d
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2021-12-01 06:15:11 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2021-12-01 06:15:11 +0000
commit 483926a283e118590da3f9ecfa75a8a4d62143ce (patch)
tree cb77052778df9a128a8cd3ff5bf7645322a13bc5 /health/health.d
parent Releasing debian version 1.31.0-4. (diff)
download netdata-483926a283e118590da3f9ecfa75a8a4d62143ce.tar.xz
netdata-483926a283e118590da3f9ecfa75a8a4d62143ce.zip
Merging upstream version 1.32.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--health/health.d/adaptec_raid.conf8
-rw-r--r--health/health.d/am2320.conf15
-rw-r--r--health/health.d/anomalies.conf8
-rw-r--r--health/health.d/apache.conf17
-rw-r--r--health/health.d/apcupsd.conf12
-rw-r--r--health/health.d/backend.conf12
-rw-r--r--health/health.d/bcache.conf8
-rw-r--r--health/health.d/beanstalkd.conf4
-rw-r--r--health/health.d/bind_rndc.conf4
-rw-r--r--health/health.d/boinc.conf16
-rw-r--r--health/health.d/btrfs.conf16
-rw-r--r--health/health.d/ceph.conf4
-rw-r--r--health/health.d/cgroups.conf8
-rw-r--r--health/health.d/cockroachdb.conf72
-rw-r--r--health/health.d/couchdb.conf16
-rw-r--r--health/health.d/cpu.conf16
-rw-r--r--health/health.d/dbengine.conf16
-rw-r--r--health/health.d/disks.conf26
-rw-r--r--health/health.d/dns_query.conf4
-rw-r--r--health/health.d/dnsmasq_dhcp.conf4
-rw-r--r--health/health.d/dockerd.conf4
-rw-r--r--health/health.d/elasticsearch.conf15
-rw-r--r--health/health.d/entropy.conf4
-rw-r--r--health/health.d/exporting.conf29
-rw-r--r--health/health.d/fping.conf16
-rw-r--r--health/health.d/fronius.conf4
-rw-r--r--health/health.d/gearman.conf20
-rw-r--r--health/health.d/geth.conf12
-rw-r--r--health/health.d/go.d.plugin.conf (renamed from health/health.d/lighttpd.conf)14
-rw-r--r--health/health.d/haproxy.conf21
-rw-r--r--health/health.d/hdfs.conf37
-rw-r--r--health/health.d/httpcheck.conf46
-rw-r--r--health/health.d/ioping.conf4
-rw-r--r--health/health.d/ipc.conf8
-rw-r--r--health/health.d/ipfs.conf4
-rw-r--r--health/health.d/ipmi.conf8
-rw-r--r--health/health.d/kubelet.conf36
-rw-r--r--health/health.d/linux_power_supply.conf4
-rw-r--r--health/health.d/load.conf16
-rw-r--r--health/health.d/mdstat.conf16
-rw-r--r--health/health.d/megacli.conf20
-rw-r--r--health/health.d/memcached.conf29
-rw-r--r--health/health.d/memory.conf12
-rw-r--r--health/health.d/mongodb.conf16
-rw-r--r--health/health.d/mysql.conf62
-rw-r--r--health/health.d/named.conf17
-rw-r--r--health/health.d/net.conf60
-rw-r--r--health/health.d/netfilter.conf4
-rw-r--r--health/health.d/nginx.conf17
-rw-r--r--health/health.d/phpfpm.conf17
-rw-r--r--health/health.d/pihole.conf49
-rw-r--r--health/health.d/portcheck.conf26
-rw-r--r--health/health.d/postgres.conf16
-rw-r--r--health/health.d/processes.conf4
-rw-r--r--health/health.d/pulsar.conf16
-rw-r--r--health/health.d/python.d.plugin.conf (renamed from health/health.d/nginx_plus.conf)14
-rw-r--r--health/health.d/ram.conf48
-rw-r--r--health/health.d/redis.conf24
-rw-r--r--health/health.d/retroshare.conf19
-rw-r--r--health/health.d/riakkv.conf38
-rw-r--r--health/health.d/scaleio.conf24
-rw-r--r--health/health.d/softnet.conf12
-rw-r--r--health/health.d/squid.conf17
-rw-r--r--health/health.d/stiebeleltron.conf4
-rw-r--r--health/health.d/swap.conf10
-rw-r--r--health/health.d/systemdunits.conf40
-rw-r--r--health/health.d/tcp_conn.conf4
-rw-r--r--health/health.d/tcp_listen.conf16
-rw-r--r--health/health.d/tcp_mem.conf4
-rw-r--r--health/health.d/tcp_orphans.conf4
-rw-r--r--health/health.d/tcp_resets.conf16
-rw-r--r--health/health.d/timex.conf17
-rw-r--r--health/health.d/udp_errors.conf8
-rw-r--r--health/health.d/unbound.conf24
-rw-r--r--health/health.d/varnish.conf12
-rw-r--r--health/health.d/vcsa.conf48
-rw-r--r--health/health.d/vernemq.conf120
-rw-r--r--health/health.d/vsphere.conf44
-rw-r--r--health/health.d/web_log.conf135
-rw-r--r--health/health.d/whoisquery.conf21
-rw-r--r--health/health.d/wmi.conf50
-rw-r--r--health/health.d/x509check.conf25
-rw-r--r--health/health.d/zfs.conf12
-rw-r--r--health/health.d/zookeeper.conf17
84 files changed, 616 insertions, 1180 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
index b067e1840..1d823addd 100644
--- a/health/health.d/adaptec_raid.conf
+++ b/health/health.d/adaptec_raid.conf
@@ -3,9 +3,9 @@
template: adaptec_raid_ld_status
on: adaptec_raid.ld_status
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: max -10s foreach *
units: bool
every: 10s
@@ -18,9 +18,9 @@ component: RAID
template: adaptec_raid_pd_state
on: adaptec_raid.pd_state
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: max -10s foreach *
units: bool
every: 10s
diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf
deleted file mode 100644
index 4bac98fbb..000000000
--- a/health/health.d/am2320.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# make sure am2320 is sending stats
-
- template: am2320_last_collected_secs
- on: am2320.temperature
- class: Other
-component: Sensors
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
index f27e39fc1..269ae544b 100644
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@@ -2,9 +2,9 @@
template: anomalies_anomaly_probabilities
on: anomalies.probability
- class: Netdata
+ class: Errors
+ type: Netdata
component: ML
- type: Errors
lookup: average -2m foreach *
every: 1m
warn: $this > 50
@@ -14,9 +14,9 @@ component: ML
template: anomalies_anomaly_flags
on: anomalies.anomaly
- class: Netdata
+ class: Errors
+ type: Netdata
component: ML
- type: Errors
lookup: sum -2m foreach *
every: 1m
warn: $this > 10
diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf
deleted file mode 100644
index c623fb880..000000000
--- a/health/health.d/apache.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure apache is running
-
- template: apache_last_collected_secs
- on: apache.requests
- class: Web Server
-component: Apache
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 07b5c28c9..65f1a69ab 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -2,9 +2,9 @@
template: apcupsd_10min_ups_load
on: apcupsd.load
- class: Power Supply
+ class: Utilization
+ type: Power Supply
component: UPS
- type: Utilization
os: *
hosts: *
lookup: average -10m unaligned of percentage
@@ -20,9 +20,9 @@ component: UPS
# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
template: apcupsd_ups_charge
on: apcupsd.charge
- class: Power Supply
+ class: Errors
+ type: Power Supply
component: UPS
- type: Errors
os: *
hosts: *
lookup: average -60s unaligned of charge
@@ -36,9 +36,9 @@ component: UPS
template: apcupsd_last_collected_secs
on: apcupsd.load
- class: Power Supply
+ class: Latency
+ type: Power Supply
component: UPS device
- type: Latency
calc: $now - $last_collected_t
every: 10s
units: seconds ago
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index 948ea551a..91d469395 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -1,9 +1,9 @@
# Alert that backends subsystem will be disabled soon
alarm: backend_metrics_eol
on: netdata.backend_metrics
- class: Netdata
+ class: Errors
+ type: Netdata
component: Exporting engine
- type: Errors
units: boolean
calc: $now - $last_collected_t
every: 1m
@@ -16,9 +16,9 @@ component: Exporting engine
alarm: backend_last_buffering
on: netdata.backend_metrics
- class: Netdata
+ class: Latency
+ type: Netdata
component: Exporting engine
- type: Latency
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -30,9 +30,9 @@ component: Exporting engine
alarm: backend_metrics_sent
on: netdata.backend_metrics
- class: Netdata
+ class: Workload
+ type: Netdata
component: Exporting engine
- type: Workload
units: %
calc: abs($sent) * 100 / abs($buffered)
every: 10s
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index d75d8e19b..49cb5ad0f 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -1,9 +1,9 @@
template: bcache_cache_errors
on: disk.bcache_cache_read_races
- class: System
+ class: Errors
+ type: System
component: Disk
- type: Errors
lookup: sum -1m unaligned absolute
units: errors
every: 1m
@@ -16,9 +16,9 @@ component: Disk
template: bcache_cache_dirty
on: disk.bcache_cache_alloc
- class: System
+ class: Utilization
+ type: System
component: Disk
- type: Utilization
calc: $dirty + $metadata + $undefined
units: %
every: 1m
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 99c754571..13ac8c182 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -2,9 +2,9 @@
template: beanstalk_server_buried_jobs
on: beanstalk.current_jobs
- class: Messaging
+ class: Workload
+ type: Messaging
component: Beanstalk
- type: Workload
calc: $buried
units: jobs
every: 10s
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index e88f87a4f..7c09225ff 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -1,8 +1,8 @@
template: bind_rndc_stats_file_size
on: bind_rndc.stats_size
- class: DNS
+ class: Utilization
+ type: DNS
component: BIND
- type: Utilization
units: megabytes
every: 60
calc: $stats_size
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 8604abee9..7d7a4fdae 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -3,9 +3,9 @@
# Warn on any compute errors encountered.
template: boinc_compute_errors
on: boinc.states
- class: Computing
+ class: Errors
+ type: Computing
component: BOINC
- type: Errors
os: *
hosts: *
families: *
@@ -21,9 +21,9 @@ component: BOINC
# Warn on lots of upload errors
template: boinc_upload_errors
on: boinc.states
- class: Computing
+ class: Errors
+ type: Computing
component: BOINC
- type: Errors
os: *
hosts: *
families: *
@@ -39,9 +39,9 @@ component: BOINC
# Warn on the task queue being empty
template: boinc_total_tasks
on: boinc.tasks
- class: Computing
+ class: Utilization
+ type: Computing
component: BOINC
- type: Utilization
os: *
hosts: *
families: *
@@ -57,9 +57,9 @@ component: BOINC
# Warn on no active tasks with a non-empty queue
template: boinc_active_tasks
on: boinc.tasks
- class: Computing
+ class: Utilization
+ type: Computing
component: BOINC
- type: Utilization
os: *
hosts: *
families: *
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index d3200a7ee..8d197aa8d 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -1,9 +1,9 @@
template: btrfs_allocated
on: btrfs.disk
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
os: *
hosts: *
families: *
@@ -18,9 +18,9 @@ component: File system
template: btrfs_data
on: btrfs.data
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
os: *
hosts: *
families: *
@@ -35,9 +35,9 @@ component: File system
template: btrfs_metadata
on: btrfs.metadata
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
os: *
hosts: *
families: *
@@ -52,9 +52,9 @@ component: File system
template: btrfs_system
on: btrfs.system
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
os: *
hosts: *
families: *
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index ed8f9b4b9..1f9da25c7 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -2,9 +2,9 @@
template: ceph_cluster_space_usage
on: ceph.general_usage
- class: Storage
+ class: Utilization
+ type: Storage
component: Ceph
- type: Utilization
calc: $used * 100 / ($used + $avail)
units: %
every: 1m
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 068533f10..45b34806c 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -3,9 +3,9 @@
template: cgroup_10min_cpu_usage
on: cgroup.cpu_limit
- class: Cgroups
+ class: Utilization
+ type: Cgroups
component: CPU
- type: Utilization
os: linux
hosts: *
lookup: average -10m unaligned
@@ -19,9 +19,9 @@ component: CPU
template: cgroup_ram_in_use
on: cgroup.mem_usage
- class: Cgroups
+ class: Utilization
+ type: Cgroups
component: Memory
- type: Utilization
os: linux
hosts: *
calc: ($ram) * 100 / $memory_limit
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
index dccd2b064..1f227841e 100644
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -1,27 +1,11 @@
-# Availability
-
- template: cockroachdb_last_collected_secs
- on: cockroachdb.live_nodes
- class: Database
-component: CockroachDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
# Capacity
template: cockroachdb_used_storage_capacity
on: cockroachdb.storage_used_capacity_percentage
- class: Database
+ class: Utilization
+ type: Database
component: CockroachDB
- type: Utilization
calc: $capacity_used_percent
units: %
every: 10s
@@ -33,9 +17,9 @@ component: CockroachDB
template: cockroachdb_used_usable_storage_capacity
on: cockroachdb.storage_used_capacity_percentage
- class: Database
+ class: Utilization
+ type: Database
component: CockroachDB
- type: Utilization
calc: $capacity_usable_used_percent
units: %
every: 10s
@@ -49,37 +33,37 @@ component: CockroachDB
template: cockroachdb_unavailable_ranges
on: cockroachdb.ranges_replication_problem
- class: Database
+ class: Errors
+ type: Database
component: CockroachDB
- type: Utilization
calc: $ranges_unavailable
units: num
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: number of ranges with fewer live replicas than the replication target
+ info: number of ranges with fewer live replicas than needed for quorum
to: dba
- template: cockroachdb_replicas_leaders_not_leaseholders
- on: cockroachdb.replicas_leaders
- class: Database
+ template: cockroachdb_underreplicated_ranges
+ on: cockroachdb.ranges_replication_problem
+ class: Errors
+ type: Database
component: CockroachDB
- type: Utilization
- calc: $replicas_leaders_not_leaseholders
+ calc: $ranges_underreplicated
units: num
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: number of replicas that are Raft leaders whose range lease is held by another store
+ info: number of ranges with fewer live replicas than the replication target
to: dba
# FD
template: cockroachdb_open_file_descriptors_limit
on: cockroachdb.process_file_descriptors
- class: Database
+ class: Utilization
+ type: Database
component: CockroachDB
- type: Utilization
calc: $sys_fd_open/$sys_fd_softlimit * 100
units: %
every: 10s
@@ -87,29 +71,3 @@ component: CockroachDB
delay: down 15m multiplier 1.5 max 1h
info: open file descriptors utilization (against softlimit)
to: dba
-
-# SQL
-
- template: cockroachdb_sql_active_connections
- on: cockroachdb.sql_connections
- class: Database
-component: CockroachDB
- type: Utilization
- calc: $sql_conns
- units: active connections
- every: 10s
- info: number of active SQL connections
- to: dba
-
- template: cockroachdb_sql_executed_statements_total_last_5m
- on: cockroachdb.sql_statements_total
- class: Database
-component: CockroachDB
- type: Workload
- lookup: sum -5m absolute of sql_query_count
- units: statements
- every: 10s
- warn: $this == 0 AND $cockroachdb_sql_active_connections != 0
- delay: down 15m up 30s multiplier 1.5 max 1h
- info: number of executed SQL statements in the last 5 minutes
- to: dba
diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf
deleted file mode 100644
index c86c6b988..000000000
--- a/health/health.d/couchdb.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# make sure couchdb is running
-
- template: couchdb_last_collected_secs
- on: couchdb.request_methods
- class: Database
-component: CouchDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index d11215768..ad6952825 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -3,9 +3,9 @@
template: 10min_cpu_usage
on: system.cpu
- class: System
+ class: Utilization
+ type: System
component: CPU
- type: Utilization
os: linux
hosts: *
lookup: average -10m unaligned of user,system,softirq,irq,guest
@@ -19,9 +19,9 @@ component: CPU
template: 10min_cpu_iowait
on: system.cpu
- class: System
+ class: Utilization
+ type: System
component: CPU
- type: Utilization
os: linux
hosts: *
lookup: average -10m unaligned of iowait
@@ -35,9 +35,9 @@ component: CPU
template: 20min_steal_cpu
on: system.cpu
- class: System
+ class: Latency
+ type: System
component: CPU
- type: Latency
os: linux
hosts: *
lookup: average -20m unaligned of steal
@@ -52,9 +52,9 @@ component: CPU
## FreeBSD
template: 10min_cpu_usage
on: system.cpu
- class: System
+ class: Utilization
+ type: System
component: CPU
- type: Utilization
os: freebsd
hosts: *
lookup: average -10m unaligned of user,system,interrupt
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index 79c156ab8..65c41b846 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -3,9 +3,9 @@
alarm: 10min_dbengine_global_fs_errors
on: netdata.dbengine_global_errors
- class: Netdata
+ class: Errors
+ type: Netdata
component: DB engine
- type: Errors
os: linux freebsd macos
hosts: *
lookup: sum -10m unaligned of fs_errors
@@ -18,9 +18,9 @@ component: DB engine
alarm: 10min_dbengine_global_io_errors
on: netdata.dbengine_global_errors
- class: Netdata
+ class: Errors
+ type: Netdata
component: DB engine
- type: Errors
os: linux freebsd macos
hosts: *
lookup: sum -10m unaligned of io_errors
@@ -33,9 +33,9 @@ component: DB engine
alarm: 10min_dbengine_global_flushing_warnings
on: netdata.dbengine_global_errors
- class: Netdata
+ class: Errors
+ type: Netdata
component: DB engine
- type: Errors
os: linux freebsd macos
hosts: *
lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
@@ -49,9 +49,9 @@ component: DB engine
alarm: 10min_dbengine_global_flushing_errors
on: netdata.dbengine_long_term_page_stats
- class: Netdata
+ class: Errors
+ type: Netdata
component: DB engine
- type: Errors
os: linux freebsd macos
hosts: *
lookup: sum -10m unaligned of flushing_pressure_deletions
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 60f8faed9..5daff61a1 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -11,9 +11,9 @@
template: disk_space_usage
on: disk.space
- class: System
+ class: Utilization
+ type: System
component: Disk
- type: Utilization
os: linux freebsd
hosts: *
families: !/dev !/dev/* !/run !/run/* *
@@ -28,9 +28,9 @@ component: Disk
template: disk_inode_usage
on: disk.inodes
- class: System
+ class: Utilization
+ type: System
component: Disk
- type: Utilization
os: linux freebsd
hosts: *
families: !/dev !/dev/* !/run !/run/* *
@@ -136,19 +136,16 @@ component: Disk
template: 10min_disk_utilization
on: disk.util
- class: System
+ class: Utilization
+ type: System
component: Disk
- type: Utilization
os: linux freebsd
hosts: *
families: *
lookup: average -10m unaligned
units: %
every: 1m
- green: 90
- red: 98
- warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
- crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: average percentage of time $family disk was busy over the last 10 minutes
to: silent
@@ -161,19 +158,16 @@ component: Disk
template: 10min_disk_backlog
on: disk.backlog
- class: System
+ class: Latency
+ type: System
component: Disk
- type: Latency
os: linux
hosts: *
families: *
lookup: average -10m unaligned
units: ms
every: 1m
- green: 2000
- red: 5000
- warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
- crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: average backlog size of the $family disk over the last 10 minutes
to: silent
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index 1fbb2c598..ec4937c0a 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -3,9 +3,9 @@
template: dns_query_time_query_time
on: dns_query_time.query_time
- class: DNS
+ class: Latency
+ type: DNS
component: DNS
- type: Latency
lookup: average -10s unaligned foreach *
units: ms
every: 10s
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 10d139f77..010b94599 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -2,9 +2,9 @@
template: dnsmasq_dhcp_dhcp_range_utilization
on: dnsmasq_dhcp.dhcp_range_utilization
- class: DHCP
+ class: Utilization
+ type: DHCP
component: Dnsmasq
- type: Utilization
every: 10s
units: %
calc: $used
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
index ba866f81b..220ddd664 100644
--- a/health/health.d/dockerd.conf
+++ b/health/health.d/dockerd.conf
@@ -1,8 +1,8 @@
template: docker_unhealthy_containers
on: docker.unhealthy_containers
- class: Containers
+ class: Errors
+ type: Containers
component: Docker
- type: Errors
units: unhealthy containers
every: 10s
lookup: average -10s
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
deleted file mode 100644
index 05d576c39..000000000
--- a/health/health.d/elasticsearch.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-
-# make sure elasticsearch is running
-
- template: elasticsearch_last_collected
- on: elasticsearch.cluster_health_status
- class: Search engine
-component: Elasticsearch
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
index 0478fa0be..13b0fcde4 100644
--- a/health/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
@@ -5,9 +5,9 @@
alarm: lowest_entropy
on: system.entropy
- class: System
+ class: Utilization
+ type: System
component: Cryptography
- type: Utilization
os: linux
hosts: *
lookup: min -5m unaligned
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
index 4430f3fd8..06f398c6e 100644
--- a/health/health.d/exporting.conf
+++ b/health/health.d/exporting.conf
@@ -1,22 +1,25 @@
-template: exporting_last_buffering
-families: *
- on: exporting_data_size
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful buffering of exporting data
- to: dba
+ template: exporting_last_buffering
+ families: *
+ on: exporting_data_size
+ class: Latency
+ type: Netdata
+component: Exporting engine
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful buffering of exporting data
+ to: dba
template: exporting_metrics_sent
families: *
on: exporting_data_size
- class: Netdata
+ class: Workload
+ type: Netdata
component: Exporting engine
- type: Workload
units: %
calc: abs($sent) * 100 / abs($buffered)
every: 10s
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
index 120fe8f28..bb22419fa 100644
--- a/health/health.d/fping.conf
+++ b/health/health.d/fping.conf
@@ -2,9 +2,9 @@
template: fping_last_collected_secs
families: *
on: fping.latency
- class: Other
+ class: Latency
+ type: Other
component: Network
- type: Latency
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -17,9 +17,9 @@ component: Network
template: fping_host_reachable
families: *
on: fping.latency
- class: Other
+ class: Errors
+ type: Other
component: Network
- type: Errors
calc: $average != nan
units: up/down
every: 10s
@@ -31,9 +31,9 @@ component: Network
template: fping_host_latency
families: *
on: fping.latency
- class: Other
+ class: Latency
+ type: Other
component: Network
- type: Latency
lookup: average -10s unaligned of average
units: ms
every: 10s
@@ -48,9 +48,9 @@ component: Network
template: fping_packet_loss
families: *
on: fping.quality
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
lookup: average -10m unaligned of returned
calc: 100 - $this
green: 1
diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf
index 81aafaa60..853bd7fbc 100644
--- a/health/health.d/fronius.conf
+++ b/health/health.d/fronius.conf
@@ -1,9 +1,9 @@
template: fronius_last_collected_secs
families: *
on: fronius.power
- class: Power Supply
+ class: Latency
+ type: Power Supply
component: Solar
- type: Latency
calc: $now - $last_collected_t
every: 10s
units: seconds ago
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index e2031bf2b..14010d445 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -1,24 +1,10 @@
-# make sure Gearman is running
- template: gearman_last_collected_secs
- on: gearman.total_jobs
- class: Computing
-component: Gearman
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
template: gearman_workers_queued
on: gearman.single_job
- class: Computing
+ class: Latency
+ type: Computing
component: Gearman
- type: Latency
- lookup: average -10m unaligned match-names of Queued
+ lookup: average -10m unaligned match-names of Pending
units: workers
every: 10s
warn: $this > 30000
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
new file mode 100644
index 000000000..dd1eb4701
--- /dev/null
+++ b/health/health.d/geth.conf
@@ -0,0 +1,12 @@
+# chainhead_header is expected to be momentarily ahead. If it is considerably ahead (e.g. more than 5 blocks), then the node is definitely out of sync.
+ template: geth_chainhead_diff_between_header_block
+ on: geth.chainhead
+ class: Workload
+ type: ethereum_node
+component: geth
+ every: 10s
+ calc: $chain_head_block - $chain_head_header
+ units: blocks
+ warn: $this != 0
+ crit: $this > 5
+ delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/lighttpd.conf b/health/health.d/go.d.plugin.conf
index 0f067549e..8bf84a976 100644
--- a/health/health.d/lighttpd.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -1,11 +1,12 @@
-# make sure lighttpd is running
+# make sure go.d.plugin data collection job is running
- template: lighttpd_last_collected_secs
- on: lighttpd.requests
- class: Web Server
-component: Lighttpd
- type: Latency
+ template: go.d_job_last_collected_secs
+ on: netdata.go_plugin_execution_time
+ class: Error
+ type: Netdata
+component: go.d.plugin
+ module: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -14,4 +15,3 @@ component: Lighttpd
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
-
diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf
index 9f6b1c577..a0ab52bca 100644
--- a/health/health.d/haproxy.conf
+++ b/health/health.d/haproxy.conf
@@ -1,8 +1,8 @@
template: haproxy_backend_server_status
on: haproxy_hs.down
- class: Web Proxy
+ class: Errors
+ type: Web Proxy
component: HAProxy
- type: Errors
units: failed servers
every: 10s
lookup: average -10s
@@ -12,25 +12,12 @@ component: HAProxy
template: haproxy_backend_status
on: haproxy_hb.down
- class: Web Proxy
+ class: Errors
+ type: Web Proxy
component: HAProxy
- type: Errors
units: failed backend
every: 10s
lookup: average -10s
crit: $this > 0
info: average number of failed haproxy backends over the last 10 seconds
to: sysadmin
-
- template: haproxy_last_collected
- on: haproxy_hb.down
- class: Web Proxy
-component: HAProxy
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
index bd8308bed..ca8df31b9 100644
--- a/health/health.d/hdfs.conf
+++ b/health/health.d/hdfs.conf
@@ -1,28 +1,11 @@
-# make sure hdfs is running
-
- template: hdfs_last_collected_secs
- on: hdfs.heap_memory
- class: Storage
-component: HDFS
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
# Common
template: hdfs_capacity_usage
on: hdfs.capacity
- class: Storage
+ class: Utilization
+ type: Storage
component: HDFS
- type: Utilization
calc: ($used) * 100 / ($used + $remaining)
units: %
every: 10s
@@ -37,9 +20,9 @@ component: HDFS
template: hdfs_missing_blocks
on: hdfs.blocks
- class: Storage
+ class: Errors
+ type: Storage
component: HDFS
- type: Errors
calc: $missing
units: missing blocks
every: 10s
@@ -51,9 +34,9 @@ component: HDFS
template: hdfs_stale_nodes
on: hdfs.data_nodes
- class: Storage
+ class: Errors
+ type: Storage
component: HDFS
- type: Errors
calc: $stale
units: dead nodes
every: 10s
@@ -65,9 +48,9 @@ component: HDFS
template: hdfs_dead_nodes
on: hdfs.data_nodes
- class: Storage
+ class: Errors
+ type: Storage
component: HDFS
- type: Errors
calc: $dead
units: dead nodes
every: 10s
@@ -81,9 +64,9 @@ component: HDFS
template: hdfs_num_failed_volumes
on: hdfs.num_failed_volumes
- class: Storage
+ class: Errors
+ type: Storage
component: HDFS
- type: Errors
calc: $fsds_num_failed_volumes
units: failed volumes
every: 10s
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index d4d6376a3..599c47acc 100644
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -1,25 +1,11 @@
- template: httpcheck_last_collected_secs
- families: *
- on: httpcheck.status
- class: Other
-component: HTTP endpoint
- type: Latency
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
template: httpcheck_web_service_up
families: *
on: httpcheck.status
- class: Web Server
+ class: Utilization
+ type: Web Server
component: HTTP endpoint
- type: Utilization
lookup: average -1m unaligned percentage of success
calc: ($this < 75) ? (0) : ($this)
every: 5s
@@ -30,9 +16,9 @@ component: HTTP endpoint
template: httpcheck_web_service_bad_content
families: *
on: httpcheck.status
- class: Web Server
+ class: Workload
+ type: Web Server
component: HTTP endpoint
- type: Workload
lookup: average -5m unaligned percentage of bad_content
every: 10s
units: %
@@ -46,9 +32,9 @@ component: HTTP endpoint
template: httpcheck_web_service_bad_status
families: *
on: httpcheck.status
- class: Web Server
+ class: Workload
+ type: Web Server
component: HTTP endpoint
- type: Workload
lookup: average -5m unaligned percentage of bad_status
every: 10s
units: %
@@ -62,9 +48,9 @@ component: HTTP endpoint
template: httpcheck_web_service_timeouts
families: *
on: httpcheck.status
- class: Web Server
+ class: Latency
+ type: Web Server
component: HTTP endpoint
- type: Latency
lookup: average -5m unaligned percentage of timeout
every: 10s
units: %
@@ -73,9 +59,9 @@ component: HTTP endpoint
template: httpcheck_no_web_service_connections
families: *
on: httpcheck.status
- class: Other
+ class: Errors
+ type: Other
component: HTTP endpoint
- type: Errors
lookup: average -5m unaligned percentage of no_connection
every: 10s
units: %
@@ -85,9 +71,9 @@ component: HTTP endpoint
template: httpcheck_web_service_unreachable
families: *
on: httpcheck.status
- class: Web Server
+ class: Errors
+ type: Web Server
component: HTTP endpoint
- type: Errors
calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
units: %
every: 10s
@@ -101,9 +87,9 @@ component: HTTP endpoint
template: httpcheck_1h_web_service_response_time
families: *
on: httpcheck.responsetime
- class: Other
+ class: Latency
+ type: Other
component: HTTP endpoint
- type: Latency
lookup: average -1h unaligned of time
every: 30s
units: ms
@@ -112,9 +98,9 @@ component: HTTP endpoint
template: httpcheck_web_service_slow
families: *
on: httpcheck.responsetime
- class: Web Server
+ class: Latency
+ type: Web Server
component: HTTP endpoint
- type: Latency
lookup: average -3m unaligned of time
units: ms
every: 10s
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 57ce4e866..ee4befbea 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -1,9 +1,9 @@
template: ioping_disk_latency
families: *
on: ioping.latency
- class: System
+ class: Latency
+ type: System
component: Disk
- type: Latency
lookup: average -10s unaligned of average
units: ms
every: 10s
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index 6eaf7abe9..c178a410a 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -3,9 +3,9 @@
alarm: semaphores_used
on: system.ipc_semaphores
- class: System
+ class: Utilization
+ type: System
component: IPC
- type: Utilization
os: linux
hosts: *
calc: $semaphores * 100 / $ipc_semaphores_max
@@ -19,9 +19,9 @@ component: IPC
alarm: semaphore_arrays_used
on: system.ipc_semaphore_arrays
- class: System
+ class: Utilization
+ type: System
component: IPC
- type: Utilization
os: linux
hosts: *
calc: $arrays * 100 / $ipc_semaphores_arrays_max
diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf
index 6268f4092..a514ddfd0 100644
--- a/health/health.d/ipfs.conf
+++ b/health/health.d/ipfs.conf
@@ -1,9 +1,9 @@
template: ipfs_datastore_usage
on: ipfs.repo_size
- class: Data Sharing
+ class: Utilization
+ type: Data Sharing
component: IPFS
- type: Utilization
calc: $size * 100 / $avail
units: %
every: 10s
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index d4fdc6c79..feadba1b7 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -1,8 +1,8 @@
alarm: ipmi_sensors_states
on: ipmi.sensors_states
- class: System
+ class: Errors
+ type: System
component: IPMI
- type: Errors
calc: $warning + $critical
units: sensors
every: 10s
@@ -14,9 +14,9 @@ component: IPMI
alarm: ipmi_events
on: ipmi.events
- class: System
+ class: Utilization
+ type: System
component: IPMI
- type: Utilization
calc: $events
units: events
every: 10s
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
index 4d3c45f97..c2778cc5e 100644
--- a/health/health.d/kubelet.conf
+++ b/health/health.d/kubelet.conf
@@ -6,9 +6,9 @@
template: kubelet_node_config_error
on: k8s_kubelet.kubelet_node_config_error
- class: Kubernetes
+ class: Errors
+ type: Kubernetes
component: Kubelet
- type: Errors
calc: $kubelet_node_config_error
units: bool
every: 10s
@@ -22,9 +22,9 @@ component: Kubelet
template: kubelet_token_requests
lookup: sum -10s of token_fail_count
on: k8s_kubelet.kubelet_token_requests
- class: Kubernetes
+ class: Errors
+ type: Kubernetes
component: Kubelet
- type: Errors
units: failed requests
every: 10s
warn: $this > 0
@@ -37,9 +37,9 @@ component: Kubelet
template: kubelet_operations_error
lookup: sum -1m
on: k8s_kubelet.kubelet_operations_errors
- class: Kubernetes
+ class: Errors
+ type: Kubernetes
component: Kubelet
- type: Errors
units: errors
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (20))
@@ -64,9 +64,9 @@ component: Kubelet
template: kubelet_1m_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
units: microseconds
every: 10s
@@ -74,9 +74,9 @@ component: Kubelet
template: kubelet_10s_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
every: 10s
@@ -92,9 +92,9 @@ component: Kubelet
template: kubelet_1m_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
units: microseconds
every: 10s
@@ -102,9 +102,9 @@ component: Kubelet
template: kubelet_10s_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
every: 10s
@@ -120,9 +120,9 @@ component: Kubelet
template: kubelet_1m_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
units: microseconds
every: 10s
@@ -130,9 +130,9 @@ component: Kubelet
template: kubelet_10s_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
every: 10s
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index e28c246a3..c0bc6de8a 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -2,9 +2,9 @@
template: linux_power_supply_capacity
on: powersupply.capacity
- class: Power Supply
+ class: Utilization
+ type: Power Supply
component: Battery
- type: Utilization
calc: $capacity
units: %
every: 10s
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index e811f6ee2..0bd872f85 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -6,9 +6,9 @@
# minute, with a special case for a single CPU of setting the trigger at 2.
alarm: load_cpu_number
on: system.load
- class: System
+ class: Utilization
+ type: System
component: Load
- type: Utilization
os: linux
hosts: *
calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
@@ -22,9 +22,9 @@ component: Load
alarm: load_average_15
on: system.load
- class: System
+ class: Utilization
+ type: System
component: Load
- type: Utilization
os: linux
hosts: *
lookup: max -1m unaligned of load15
@@ -37,9 +37,9 @@ component: Load
alarm: load_average_5
on: system.load
- class: System
+ class: Utilization
+ type: System
component: Load
- type: Utilization
os: linux
hosts: *
lookup: max -1m unaligned of load5
@@ -52,9 +52,9 @@ component: Load
alarm: load_average_1
on: system.load
- class: System
+ class: Utilization
+ type: System
component: Load
- type: Utilization
os: linux
hosts: *
lookup: max -1m unaligned of load1
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index 67483b201..cedaa000e 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -1,8 +1,8 @@
template: mdstat_last_collected
on: md.disks
- class: System
+ class: Latency
+ type: System
component: RAID
- type: Latency
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -13,9 +13,9 @@ component: RAID
template: mdstat_disks
on: md.disks
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
units: failed devices
every: 10s
calc: $down
@@ -26,9 +26,9 @@ component: RAID
template: mdstat_mismatch_cnt
on: md.mismatch_cnt
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
families: !*(raid1) !*(raid10) *
units: unsynchronized blocks
calc: $count
@@ -40,9 +40,9 @@ component: RAID
template: mdstat_nonredundant_last_collected
on: md.nonredundant
- class: System
+ class: Latency
+ type: System
component: RAID
- type: Latency
calc: $now - $last_collected_t
units: seconds ago
every: 10s
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 1b6502f62..9fbcfdb92 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -3,9 +3,9 @@
template: megacli_adapter_state
on: megacli.adapter_degraded
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: max -10s foreach *
units: boolean
every: 10s
@@ -18,9 +18,9 @@ component: RAID
template: megacli_pd_predictive_failures
on: megacli.pd_predictive_failure
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: sum -10s foreach *
units: predictive failures
every: 10s
@@ -31,9 +31,9 @@ component: RAID
template: megacli_pd_media_errors
on: megacli.pd_media_error
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: sum -10s foreach *
units: media errors
every: 10s
@@ -46,9 +46,9 @@ component: RAID
template: megacli_bbu_relative_charge
on: megacli.bbu_relative_charge
- class: System
+ class: Workload
+ type: System
component: RAID
- type: Workload
lookup: average -10s
units: percent
every: 10s
@@ -59,9 +59,9 @@ component: RAID
template: megacli_bbu_cycle_count
on: megacli.bbu_cycle_count
- class: System
+ class: Workload
+ type: System
component: RAID
- type: Workload
lookup: average -10s
units: cycles
every: 10s
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
index f4b734c38..2a2fe4b82 100644
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@@ -1,28 +1,11 @@
-# make sure memcached is running
-
- template: memcached_last_collected_secs
- on: memcached.cache
- class: KV Storage
-component: Memcached
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-
# detect if memcached cache is full
template: memcached_cache_memory_usage
on: memcached.cache
- class: KV Storage
+ class: Utilization
+ type: KV Storage
component: Memcached
- type: Utilization
calc: $used * 100 / ($used + $available)
units: %
every: 10s
@@ -37,9 +20,9 @@ component: Memcached
template: memcached_cache_fill_rate
on: memcached.cache
- class: KV Storage
+ class: Utilization
+ type: KV Storage
component: Memcached
- type: Utilization
lookup: min -10m at -50m unaligned of available
calc: ($this - $available) / (($now - $after) / 3600)
units: KB/hour
@@ -51,9 +34,9 @@ component: Memcached
template: memcached_out_of_cache_space_time
on: memcached.cache
- class: KV Storage
+ class: Utilization
+ type: KV Storage
component: Memcached
- type: Utilization
calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
units: hours
every: 10s
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
index ab651315f..010cbbd7b 100644
--- a/health/health.d/memory.conf
+++ b/health/health.d/memory.conf
@@ -3,9 +3,9 @@
alarm: 1hour_ecc_memory_correctable
on: mem.ecc_ce
- class: System
+ class: Errors
+ type: System
component: Memory
- type: Errors
os: linux
hosts: *
lookup: sum -10m unaligned
@@ -18,9 +18,9 @@ component: Memory
alarm: 1hour_ecc_memory_uncorrectable
on: mem.ecc_ue
- class: System
+ class: Errors
+ type: System
component: Memory
- type: Errors
os: linux
hosts: *
lookup: sum -10m unaligned
@@ -33,9 +33,9 @@ component: Memory
alarm: 1hour_memory_hw_corrupted
on: mem.hwcorrupt
- class: System
+ class: Errors
+ type: System
component: Memory
- type: Errors
os: linux
hosts: *
calc: $HardwareCorrupted
diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf
deleted file mode 100644
index 8c9bdeb6f..000000000
--- a/health/health.d/mongodb.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# make sure mongodb is running
-
- template: mongodb_last_collected_secs
- on: mongodb.read_operations
- class: Database
-component: MongoDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 91860c4a7..34452d983 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -1,29 +1,11 @@
-# make sure mysql is running
-
- template: mysql_last_collected_secs
- on: mysql.queries
- class: Database
-component: MySQL
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-
-# -----------------------------------------------------------------------------
# slow queries
template: mysql_10s_slow_queries
on: mysql.queries
- class: Database
+ class: Latency
+ type: Database
component: MySQL
- type: Latency
lookup: sum -10s of slow_queries
units: slow queries
every: 10s
@@ -39,9 +21,9 @@ component: MySQL
template: mysql_10s_table_locks_immediate
on: mysql.table_locks
- class: Database
+ class: Utilization
+ type: Database
component: MySQL
- type: Utilization
lookup: sum -10s absolute of immediate
units: immediate locks
every: 10s
@@ -50,9 +32,9 @@ component: MySQL
template: mysql_10s_table_locks_waited
on: mysql.table_locks
- class: Database
+ class: Latency
+ type: Database
component: MySQL
- type: Latency
lookup: sum -10s absolute of waited
units: waited locks
every: 10s
@@ -61,9 +43,9 @@ component: MySQL
template: mysql_10s_waited_locks_ratio
on: mysql.table_locks
- class: Database
+ class: Latency
+ type: Database
component: MySQL
- type: Latency
calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
units: %
every: 10s
@@ -79,9 +61,9 @@ component: MySQL
template: mysql_connections
on: mysql.connections_active
- class: Database
+ class: Utilization
+ type: Database
component: MySQL
- type: Utilization
calc: $active * 100 / $limit
units: %
every: 10s
@@ -97,9 +79,9 @@ component: MySQL
template: mysql_replication
on: mysql.slave_status
- class: Database
+ class: Errors
+ type: Database
component: MySQL
- type: Errors
calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
units: ok/failed
every: 10s
@@ -110,9 +92,9 @@ component: MySQL
template: mysql_replication_lag
on: mysql.slave_behind
- class: Database
+ class: Latency
+ type: Database
component: MySQL
- type: Errors
calc: $seconds
units: seconds
every: 10s
@@ -129,9 +111,9 @@ component: MySQL
template: mysql_galera_cluster_size_max_2m
on: mysql.galera_cluster_size
- class: Database
+ class: Utilization
+ type: Database
component: MySQL
- type: Utilization
lookup: max -2m absolute
units: nodes
every: 10s
@@ -140,9 +122,9 @@ component: MySQL
template: mysql_galera_cluster_size
on: mysql.galera_cluster_size
- class: Database
+ class: Utilization
+ type: Database
component: MySQL
- type: Utilization
calc: $nodes
units: nodes
every: 10s
@@ -156,9 +138,9 @@ component: MySQL
template: mysql_galera_cluster_state
on: mysql.galera_cluster_state
- class: Database
+ class: Errors
+ type: Database
component: MySQL
- type: Errors
calc: $state
every: 10s
warn: $this == 2 OR $this == 3
@@ -173,9 +155,9 @@ component: MySQL
template: mysql_galera_cluster_status
on: mysql.galera_cluster_status
- class: Database
+ class: Errors
+ type: Database
component: MySQL
- type: Errors
calc: $wsrep_cluster_status
every: 10s
crit: $mysql_galera_cluster_state != nan AND $this != 0
diff --git a/health/health.d/named.conf b/health/health.d/named.conf
deleted file mode 100644
index 90266df16..000000000
--- a/health/health.d/named.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure named is running
-
- template: named_last_collected_secs
- on: named.global_queries
- class: DNS
-component: BIND
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: domainadmin
-
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 04219e163..028ca7b81 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -6,9 +6,9 @@
template: interface_speed
on: net.net
- class: System
+ class: Latency
+ type: System
component: Network
- type: Latency
os: *
hosts: *
families: *
@@ -19,9 +19,9 @@ component: Network
template: 1m_received_traffic_overflow
on: net.net
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
families: *
@@ -36,9 +36,9 @@ component: Network
template: 1m_sent_traffic_overflow
on: net.net
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
families: *
@@ -63,9 +63,9 @@ component: Network
template: inbound_packets_dropped
on: net.drops
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: !net* *
@@ -76,9 +76,9 @@ component: Network
template: outbound_packets_dropped
on: net.drops
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: !net* *
@@ -89,14 +89,14 @@ component: Network
template: inbound_packets_dropped_ratio
on: net.packets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: !net* !wl* *
lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
warn: $this >= 2
@@ -106,9 +106,9 @@ component: Network
template: outbound_packets_dropped_ratio
on: net.packets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: !net* !wl* *
@@ -123,14 +123,14 @@ component: Network
template: wifi_inbound_packets_dropped_ratio
on: net.packets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: wl*
lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
warn: $this >= 10
@@ -140,9 +140,9 @@ component: Network
template: wifi_outbound_packets_dropped_ratio
on: net.packets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: wl*
@@ -160,9 +160,9 @@ component: Network
template: interface_inbound_errors
on: net.errors
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: freebsd
hosts: *
families: *
@@ -176,9 +176,9 @@ component: Network
template: interface_outbound_errors
on: net.errors
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: freebsd
hosts: *
families: *
@@ -200,9 +200,9 @@ component: Network
template: 10min_fifo_errors
on: net.fifo
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -225,9 +225,9 @@ component: Network
template: 1m_received_packets_rate
on: net.packets
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux freebsd
hosts: *
families: *
@@ -238,9 +238,9 @@ component: Network
template: 10s_received_packets_storm
on: net.packets
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux freebsd
hosts: *
families: *
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
index 35c89caf7..7de383fa2 100644
--- a/health/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -3,9 +3,9 @@
alarm: netfilter_conntrack_full
on: netfilter.conntrack_sockets
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: max -10s unaligned of connections
diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf
deleted file mode 100644
index 30c738f47..000000000
--- a/health/health.d/nginx.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure nginx is running
-
- template: nginx_last_collected_secs
- on: nginx.requests
- class: Web Server
-component: NGINX
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/phpfpm.conf b/health/health.d/phpfpm.conf
deleted file mode 100644
index fc073a944..000000000
--- a/health/health.d/phpfpm.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure phpfpm is running
-
- template: phpfpm_last_collected_secs
- on: phpfpm.requests
- class: Web Server
-component: PHP-FPM
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 72622caed..2e5c1cbfd 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,45 +1,12 @@
-# Make sure Pi-hole is responding.
-
- template: pihole_last_collected_secs
- on: pihole.dns_queries_total
- class: Ad Filtering
-component: Pi-hole
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-# Blocked DNS queries.
-
- template: pihole_blocked_queries
- on: pihole.dns_queries_percentage
- class: Ad Filtering
-component: Pi-hole
- type: Errors
- every: 10s
- units: %
- calc: $blocked
- warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
- crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
- delay: up 2m down 5m
- info: percentage of blocked dns queries over the last 24 hour
- to: sysadmin
-
-
# Blocklist last update time.
# Default update interval is a week.
template: pihole_blocklist_last_update
on: pihole.blocklist_last_update
- class: Ad Filtering
+ class: Errors
+ type: Ad Filtering
component: Pi-hole
- type: Errors
every: 10s
units: seconds
calc: $ago
@@ -52,15 +19,15 @@ component: Pi-hole
template: pihole_blocklist_gravity_file
on: pihole.blocklist_last_update
- class: Ad Filtering
+ class: Errors
+ type: Ad Filtering
component: Pi-hole
- type: Errors
every: 10s
units: boolean
calc: $file_exists
crit: $this != 1
delay: up 2m down 5m
- info: gravity.list (blocklist) file existence state (0: exists, 1: not-exists)
+ info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
to: sysadmin
# Pi-hole's ability to block unwanted domains.
@@ -68,13 +35,13 @@ component: Pi-hole
template: pihole_status
on: pihole.unwanted_domains_blocking_status
- class: Ad Filtering
+ class: Errors
+ type: Ad Filtering
component: Pi-hole
- type: Errors
every: 10s
units: boolean
calc: $enabled
warn: $this != 1
delay: up 2m down 5m
- info: unwanted domains blocking status (0: enabled, 1: disabled)
+ info: unwanted domains blocking status (0: disabled, 1: enabled)
to: sysadmin
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index b977dbb31..8cbd7729c 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -1,25 +1,11 @@
- template: portcheck_last_collected_secs
- families: *
- on: portcheck.status
- class: Other
-component: TCP endpoint
- type: Latency
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
template: portcheck_service_reachable
families: *
on: portcheck.status
- class: Other
+ class: Workload
+ type: Other
component: TCP endpoint
- type: Workload
lookup: average -1m unaligned percentage of success
calc: ($this < 75) ? (0) : ($this)
every: 5s
@@ -30,9 +16,9 @@ component: TCP endpoint
template: portcheck_connection_timeouts
families: *
on: portcheck.status
- class: Other
+ class: Errors
+ type: Other
component: TCP endpoint
- type: Errors
lookup: average -5m unaligned percentage of timeout
every: 10s
units: %
@@ -45,9 +31,9 @@ component: TCP endpoint
template: portcheck_connection_fails
families: *
on: portcheck.status
- class: Other
+ class: Errors
+ type: Other
component: TCP endpoint
- type: Errors
lookup: average -5m unaligned percentage of no_connection,failed
every: 10s
units: %
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
deleted file mode 100644
index f908a802a..000000000
--- a/health/health.d/postgres.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# make sure postgres is running
-
- template: postgres_last_collected_secs
- on: postgres.db_stat_transactions
- class: Database
-component: PostgreSQL
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
index b44a24c0b..2929ee3d4 100644
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@@ -2,9 +2,9 @@
alarm: active_processes
on: system.active_processes
- class: System
+ class: Workload
+ type: System
component: Processes
- type: Workload
hosts: *
calc: $active * 100 / $pidmax
units: %
diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf
deleted file mode 100644
index 9903d4e38..000000000
--- a/health/health.d/pulsar.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# Availability
-
- template: pulsar_last_collected_secs
- on: pulsar.broker_components
- class: Messaging
-component: Pulsar
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/health/health.d/nginx_plus.conf b/health/health.d/python.d.plugin.conf
index 5849a9e7e..f3abc588f 100644
--- a/health/health.d/nginx_plus.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -1,11 +1,12 @@
-# make sure nginx_plus is running
+# make sure python.d.plugin data collection job is running
- template: nginx_plus_last_collected_secs
- on: nginx_plus.requests_total
- class: Web Server
-component: NGINX Plus
- type: Latency
+ template: python.d_job_last_collected_secs
+ on: netdata.pythond_runtime
+ class: Error
+ type: Netdata
+component: python.d.plugin
+ module: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -14,4 +15,3 @@ component: NGINX Plus
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
-
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 0e3cc29fa..6e6e3b400 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -3,9 +3,9 @@
alarm: used_ram_to_ignore
on: system.ram
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: linux freebsd
hosts: *
calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
@@ -15,13 +15,12 @@ component: Memory
alarm: ram_in_use
on: system.ram
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: linux
hosts: *
-# calc: $used * 100 / ($used + $cached + $free)
- calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free + $buffers)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
@@ -32,12 +31,12 @@ component: Memory
alarm: ram_available
on: mem.available
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: linux
hosts: *
- calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
@@ -46,24 +45,25 @@ component: Memory
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
- alarm: oom_kill
- on: mem.oom_kill
- os: linux
- hosts: *
- lookup: sum -1m unaligned
- units: kills
- every: 10s
- warn: $this > 0
- delay: down 5m
- info: number of out of memory kills in the last minute
- to: sysadmin
+ alarm: oom_kill
+ on: mem.oom_kill
+ os: linux
+ hosts: *
+ lookup: sum -30m unaligned
+ units: kills
+ every: 5m
+ warn: $this > 0
+ delay: down 10m
+host labels: _is_k8s_node = false
+ info: number of out of memory kills in the last 30 minutes
+ to: sysadmin
## FreeBSD
alarm: ram_in_use
on: system.ram
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: freebsd
hosts: *
calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
@@ -77,9 +77,9 @@ component: Memory
alarm: ram_available
on: system.ram
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: freebsd
hosts: *
calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index e8b289942..dfb771e8c 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -1,26 +1,10 @@
-# make sure redis is running
-
- template: redis_last_collected_secs
- on: redis.operations
- class: KV Storage
-component: Redis
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
template: redis_bgsave_broken
families: *
on: redis.bgsave_health
- class: KV Storage
+ class: Errors
+ type: KV Storage
component: Redis
- type: Errors
every: 10s
crit: $rdb_last_bgsave_status != 0
units: ok/failed
@@ -31,9 +15,9 @@ component: Redis
template: redis_bgsave_slow
families: *
on: redis.bgsave_now
- class: KV Storage
+ class: Latency
+ type: KV Storage
component: Redis
- type: Latency
every: 10s
warn: $rdb_bgsave_in_progress > 600
crit: $rdb_bgsave_in_progress > 1200
diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf
index ca22e60de..14aa76b4c 100644
--- a/health/health.d/retroshare.conf
+++ b/health/health.d/retroshare.conf
@@ -1,26 +1,11 @@
-# make sure RetroShare is running
-
- template: retroshare_last_collected_secs
- on: retroshare.peers
- class: Data Sharing
-component: Retroshare
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
# make sure the DHT is fine when active
template: retroshare_dht_working
on: retroshare.dht
- class: Data Sharing
+ class: Utilization
+ type: Data Sharing
component: Retroshare
- type: Utilization
calc: $dht_size_all
units: peers
every: 1m
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
index b2c0e8d9c..261fd48c6 100644
--- a/health/health.d/riakkv.conf
+++ b/health/health.d/riakkv.conf
@@ -1,24 +1,10 @@
-# Ensure that Riak is running. template: riak_last_collected_secs
- template: riakkv_last_collected_secs
- on: riak.kv.throughput
- class: Database
-component: Riak KV
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
# Warn if a list keys operation is running.
template: riakkv_list_keys_active
on: riak.core.fsm_active
- class: Database
+ class: Utilization
+ type: Database
component: Riak KV
- type: Utilization
calc: $list_fsm_active
units: state machines
every: 10s
@@ -31,9 +17,9 @@ component: Riak KV
# KV GET
template: riakkv_1h_kv_get_mean_latency
on: riak.kv.latency.get
- class: Database
+ class: Latency
+ type: Database
component: Riak KV
- type: Latency
calc: $node_get_fsm_time_mean
lookup: average -1h unaligned of time
every: 30s
@@ -43,9 +29,9 @@ component: Riak KV
template: riakkv_kv_get_slow
on: riak.kv.latency.get
- class: Database
+ class: Latency
+ type: Database
component: Riak KV
- type: Latency
calc: $mean
lookup: average -3m unaligned of time
units: ms
@@ -61,9 +47,9 @@ component: Riak KV
# KV PUT
template: riakkv_1h_kv_put_mean_latency
on: riak.kv.latency.put
- class: Database
+ class: Latency
+ type: Database
component: Riak KV
- type: Latency
calc: $node_put_fsm_time_mean
lookup: average -1h unaligned of time
every: 30s
@@ -73,9 +59,9 @@ component: Riak KV
template: riakkv_kv_put_slow
on: riak.kv.latency.put
- class: Database
+ class: Latency
+ type: Database
component: Riak KV
- type: Latency
calc: $mean
lookup: average -3m unaligned of time
units: ms
@@ -95,9 +81,9 @@ component: Riak KV
# On systems observed, this is < 2000, but may grow depending on load.
template: riakkv_vm_high_process_count
on: riak.vm
- class: Database
+ class: Utilization
+ type: Database
component: Riak KV
- type: Utilization
calc: $sys_process_count
units: processes
every: 10s
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index 3c0dc1168..ab110bf07 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -1,27 +1,11 @@
-# make sure scaleio is running
-
- template: scaleio_last_collected_secs
- on: scaleio.system_capacity_total
- class: Storage
-component: ScaleIO
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
# make sure Storage Pool capacity utilization is under limit
template: scaleio_storage_pool_capacity_utilization
on: scaleio.storage_pool_capacity_utilization
- class: Storage
+ class: Utilization
+ type: Storage
component: ScaleIO
- type: Utilization
calc: $used
units: %
every: 10s
@@ -36,9 +20,9 @@ component: ScaleIO
template: scaleio_sdc_mdm_connection_state
on: scaleio.sdc_mdm_connection_state
- class: Storage
+ class: Utilization
+ type: Storage
component: ScaleIO
- type: Utilization
calc: $connected
every: 10s
warn: $this != 1
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index d8b01caff..345f87505 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -5,9 +5,9 @@
alarm: 1min_netdev_backlog_exceeded
on: system.softnet_stat
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -1m unaligned absolute of dropped
@@ -21,9 +21,9 @@ component: Network
alarm: 1min_netdev_budget_ran_outs
on: system.softnet_stat
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -1m unaligned absolute of squeezed
@@ -38,9 +38,9 @@ component: Network
alarm: 10min_netisr_backlog_exceeded
on: system.softnet_stat
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: freebsd
hosts: *
lookup: average -1m unaligned absolute of qdrops
diff --git a/health/health.d/squid.conf b/health/health.d/squid.conf
deleted file mode 100644
index 5c3d17629..000000000
--- a/health/health.d/squid.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure squid is running
-
- template: squid_last_collected_secs
- on: squid.clients_requests
- class: Web Proxy
-component: Squid
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: proxyadmin
-
diff --git a/health/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf
index f793b5ed1..493c8b73a 100644
--- a/health/health.d/stiebeleltron.conf
+++ b/health/health.d/stiebeleltron.conf
@@ -1,9 +1,9 @@
template: stiebeleltron_last_collected_secs
families: *
on: stiebeleltron.heating.hc1
- class: Other
+ class: Latency
+ type: Other
component: Sensors
- type: Latency
calc: $now - $last_collected_t
every: 10s
units: seconds ago
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
index 5b3f89a97..03c319320 100644
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@@ -3,9 +3,9 @@
alarm: 30min_ram_swapped_out
on: system.swapio
- class: System
+ class: Workload
+ type: System
component: Memory
- type: Workload
os: linux freebsd
hosts: *
lookup: sum -30m unaligned absolute of out
@@ -20,12 +20,12 @@ component: Memory
alarm: used_swap
on: system.swap
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: linux freebsd
hosts: *
- calc: $used * 100 / ( $used + $free )
+ calc: ($used + $free) > 0 ? ($used * 100 / ($used + $free)) : 0
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
index cc1a8698d..38213a8db 100644
--- a/health/health.d/systemdunits.conf
+++ b/health/health.d/systemdunits.conf
@@ -4,9 +4,9 @@
## Service units
template: systemd_service_units_state
on: systemd.service_units_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -18,9 +18,9 @@ component: Systemd units
## Socket units
template: systemd_socket_units_state
on: systemd.socket_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -32,9 +32,9 @@ component: Systemd units
## Target units
template: systemd_target_units_state
on: systemd.target_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -46,9 +46,9 @@ component: Systemd units
## Path units
template: systemd_path_units_state
on: systemd.path_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -60,9 +60,9 @@ component: Systemd units
## Device units
template: systemd_device_units_state
on: systemd.device_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -74,9 +74,9 @@ component: Systemd units
## Mount units
template: systemd_mount_units_state
on: systemd.mount_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -88,9 +88,9 @@ component: Systemd units
## Automount units
template: systemd_automount_units_state
on: systemd.automount_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -102,9 +102,9 @@ component: Systemd units
## Swap units
template: systemd_swap_units_state
on: systemd.swap_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -116,9 +116,9 @@ component: Systemd units
## Scope units
template: systemd_scope_units_state
on: systemd.scope_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -130,9 +130,9 @@ component: Systemd units
## Slice units
template: systemd_slice_units_state
on: systemd.slice_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index f2c5e4e5d..67b3bee53 100644
--- a/health/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
@@ -7,9 +7,9 @@
alarm: tcp_connections
on: ipv4.tcpsock
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 51a0e461c..d4bcfa248 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -20,9 +20,9 @@
alarm: 1m_tcp_accept_queue_overflows
on: ip.tcp_accept_queue
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: average -60s unaligned absolute of ListenOverflows
@@ -38,9 +38,9 @@ component: Network
# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
alarm: 1m_tcp_accept_queue_drops
on: ip.tcp_accept_queue
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: average -60s unaligned absolute of ListenDrops
@@ -63,9 +63,9 @@ component: Network
alarm: 1m_tcp_syn_queue_drops
on: ip.tcp_syn_queue
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: average -60s unaligned absolute of TCPReqQFullDrop
@@ -80,9 +80,9 @@ component: Network
alarm: 1m_tcp_syn_queue_cookies
on: ip.tcp_syn_queue
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 646e5c6da..318be20ac 100644
--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@@ -8,9 +8,9 @@
alarm: tcp_memory
on: ipv4.sockstat_tcp_mem
- class: System
+ class: Utilization
+ type: System
component: Network
- type: Utilization
os: linux
hosts: *
calc: ${mem} * 100 / ${tcp_mem_high}
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 6e94d67d1..cbd628da5 100644
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@@ -9,9 +9,9 @@
alarm: tcp_orphans
on: ipv4.sockstat_tcp_sockets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
calc: ${orphan} * 100 / ${tcp_max_orphans}
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 41355dad6..190271e47 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -6,9 +6,9 @@
alarm: 1m_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -1m at -10s unaligned absolute of OutRsts
@@ -18,9 +18,9 @@ component: Network
alarm: 10s_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -10s unaligned absolute of OutRsts
@@ -40,9 +40,9 @@ component: Network
alarm: 1m_ipv4_tcp_resets_received
on: ipv4.tcphandshake
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux freebsd
hosts: *
lookup: average -1m at -10s unaligned absolute of AttemptFails
@@ -52,9 +52,9 @@ component: Network
alarm: 10s_ipv4_tcp_resets_received
on: ipv4.tcphandshake
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux freebsd
hosts: *
lookup: average -10s unaligned absolute of AttemptFails
diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf
new file mode 100644
index 000000000..ea90c4000
--- /dev/null
+++ b/health/health.d/timex.conf
@@ -0,0 +1,17 @@
+
+# It can take several minutes before ntpd selects a server to synchronize with;
+# try checking after 17 minutes (1024 seconds).
+
+ alarm: system_clock_sync_state
+ on: system.clock_sync_state
+ os: linux
+ class: Error
+ type: System
+component: Clock
+ calc: $state
+ units: synchronization state
+ every: 10s
+ warn: $system.uptime.uptime > 17 * 60 AND $this == 0
+ delay: down 5m
+ info: the system time is not synchronized to a reliable server
+ to: silent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 342a1aedd..64f47dfa7 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -6,9 +6,9 @@
alarm: 1m_ipv4_udp_receive_buffer_errors
on: ipv4.udperrors
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux freebsd
hosts: *
lookup: average -1m unaligned absolute of RcvbufErrors
@@ -24,9 +24,9 @@ component: Network
alarm: 1m_ipv4_udp_send_buffer_errors
on: ipv4.udperrors
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -1m unaligned absolute of SndbufErrors
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
index 1df15474f..4e8d164d2 100644
--- a/health/health.d/unbound.conf
+++ b/health/health.d/unbound.conf
@@ -1,27 +1,11 @@
-# make sure unbound is running
-
- template: unbound_last_collected_secs
- on: unbound.queries
- class: DNS
-component: Unbound
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
# make sure there is no overwritten/dropped queries in the request-list
template: unbound_request_list_overwritten
on: unbound.request_list_jostle_list
- class: DNS
+ class: Errors
+ type: DNS
component: Unbound
- type: Errors
lookup: average -60s unaligned absolute match-names of overwritten
units: queries
every: 10s
@@ -32,9 +16,9 @@ component: Unbound
template: unbound_request_list_dropped
on: unbound.request_list_jostle_list
- class: DNS
+ class: Errors
+ type: DNS
component: Unbound
- type: Errors
lookup: average -60s unaligned absolute match-names of dropped
units: queries
every: 10s
diff --git a/health/health.d/varnish.conf b/health/health.d/varnish.conf
deleted file mode 100644
index 7f3bd6c82..000000000
--- a/health/health.d/varnish.conf
+++ /dev/null
@@ -1,12 +0,0 @@
- alarm: varnish_last_collected
- on: varnish.uptime
- class: Web Proxy
-component: Varnish
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index 8538e488c..a9cc7ceef 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -1,20 +1,4 @@
-# make sure vcsa is running and responding
-
- template: vcsa_last_collected_secs
- on: vcsa.system_health
- class: Virtual Machine
-component: VMware vCenter
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
# Overall system health:
# - 0: all components are healthy.
# - 1: one or more components might become overloaded soon.
@@ -24,9 +8,9 @@ component: VMware vCenter
template: vcsa_system_health
on: vcsa.system_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of system
units: status
every: 10s
@@ -46,9 +30,9 @@ component: VMware vCenter
template: vcsa_swap_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of swap
units: status
every: 10s
@@ -61,9 +45,9 @@ component: VMware vCenter
template: vcsa_storage_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of storage
units: status
every: 10s
@@ -76,9 +60,9 @@ component: VMware vCenter
template: vcsa_mem_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of mem
units: status
every: 10s
@@ -91,9 +75,9 @@ component: VMware vCenter
template: vcsa_load_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Utilization
+ type: Virtual Machine
component: VMware vCenter
- type: Utilization
lookup: max -10s unaligned of load
units: status
every: 10s
@@ -106,9 +90,9 @@ component: VMware vCenter
template: vcsa_database_storage_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of database_storage
units: status
every: 10s
@@ -121,9 +105,9 @@ component: VMware vCenter
template: vcsa_applmgmt_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of applmgmt
units: status
every: 10s
@@ -143,9 +127,9 @@ component: VMware vCenter
template: vcsa_software_updates_health
on: vcsa.software_updates_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of software_packages
units: status
every: 10s
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
index 737147f38..cfbe2a524 100644
--- a/health/health.d/vernemq.conf
+++ b/health/health.d/vernemq.conf
@@ -1,27 +1,11 @@
-# Availability
-
- template: vernemq_last_collected_secs
- on: vernemq.node_uptime
- class: Messaging
-component: VerneMQ
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
# Socket errors
template: vernemq_socket_errors
on: vernemq.socket_errors
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: sum -1m unaligned absolute of socket_error
units: errors
every: 1m
@@ -34,9 +18,9 @@ component: VerneMQ
template: vernemq_queue_message_drop
on: vernemq.queue_undelivered_messages
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute of queue_message_drop
units: dropped messages
every: 1m
@@ -47,9 +31,9 @@ component: VerneMQ
template: vernemq_queue_message_expired
on: vernemq.queue_undelivered_messages
- class: Messaging
+ class: Latency
+ type: Messaging
component: VerneMQ
- type: Latency
lookup: average -1m unaligned absolute of queue_message_expired
units: expired messages
every: 1m
@@ -60,9 +44,9 @@ component: VerneMQ
template: vernemq_queue_message_unhandled
on: vernemq.queue_undelivered_messages
- class: Messaging
+ class: Latency
+ type: Messaging
component: VerneMQ
- type: Latency
lookup: average -1m unaligned absolute of queue_message_unhandled
units: unhandled messages
every: 1m
@@ -75,9 +59,9 @@ component: VerneMQ
template: vernemq_average_scheduler_utilization
on: vernemq.average_scheduler_utilization
- class: Messaging
+ class: Utilization
+ type: Messaging
component: VerneMQ
- type: Utilization
lookup: average -10m unaligned
units: %
every: 1m
@@ -91,9 +75,9 @@ component: VerneMQ
template: vernemq_cluster_dropped
on: vernemq.cluster_dropped
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: sum -1m unaligned
units: KiB
every: 1m
@@ -104,9 +88,9 @@ component: VerneMQ
template: vernemq_netsplits
on: vernemq.netsplits
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: sum -1m unaligned absolute of netsplit_detected
units: netsplits
every: 10s
@@ -119,9 +103,9 @@ component: VerneMQ
template: vernemq_mqtt_connack_sent_reason_unsuccessful
on: vernemq.mqtt_connack_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -134,9 +118,9 @@ component: VerneMQ
template: vernemq_mqtt_disconnect_received_reason_not_normal
on: vernemq.mqtt_disconnect_received_reason
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
units: packets
every: 1m
@@ -147,9 +131,9 @@ component: VerneMQ
template: vernemq_mqtt_disconnect_sent_reason_not_normal
on: vernemq.mqtt_disconnect_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
units: packets
every: 1m
@@ -162,9 +146,9 @@ component: VerneMQ
template: vernemq_mqtt_subscribe_error
on: vernemq.mqtt_subscribe_error
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute
units: failed ops
every: 1m
@@ -175,9 +159,9 @@ component: VerneMQ
template: vernemq_mqtt_subscribe_auth_error
on: vernemq.mqtt_subscribe_auth_error
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: attempts
every: 1m
@@ -190,9 +174,9 @@ component: VerneMQ
template: vernemq_mqtt_unsubscribe_error
on: vernemq.mqtt_unsubscribe_error
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute
units: failed ops
every: 1m
@@ -205,9 +189,9 @@ component: VerneMQ
template: vernemq_mqtt_publish_errors
on: vernemq.mqtt_publish_errors
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute
units: failed ops
every: 1m
@@ -218,9 +202,9 @@ component: VerneMQ
template: vernemq_mqtt_publish_auth_errors
on: vernemq.mqtt_publish_auth_errors
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: attempts
every: 1m
@@ -233,9 +217,9 @@ component: VerneMQ
template: vernemq_mqtt_puback_received_reason_unsuccessful
on: vernemq.mqtt_puback_received_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -246,9 +230,9 @@ component: VerneMQ
template: vernemq_mqtt_puback_sent_reason_unsuccessful
on: vernemq.mqtt_puback_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -259,9 +243,9 @@ component: VerneMQ
template: vernemq_mqtt_puback_unexpected
on: vernemq.mqtt_puback_invalid_error
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: messages
every: 1m
@@ -274,9 +258,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrec_received_reason_unsuccessful
on: vernemq.mqtt_pubrec_received_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -287,9 +271,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
on: vernemq.mqtt_pubrec_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -300,9 +284,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrec_invalid_error
on: vernemq.mqtt_pubrec_invalid_error
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: messages
every: 1m
@@ -315,9 +299,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrel_received_reason_unsuccessful
on: vernemq.mqtt_pubrel_received_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -328,9 +312,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
on: vernemq.mqtt_pubrel_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -343,9 +327,9 @@ component: VerneMQ
template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
on: vernemq.mqtt_pubcomp_received_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -356,9 +340,9 @@ component: VerneMQ
template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
on: vernemq.mqtt_pubcomp_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -369,9 +353,9 @@ component: VerneMQ
template: vernemq_mqtt_pubcomp_unexpected
on: vernemq.mqtt_pubcomp_invalid_error
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: messages
every: 1m
diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf
index aee7c5cd4..d8fc899b9 100644
--- a/health/health.d/vsphere.conf
+++ b/health/health.d/vsphere.conf
@@ -6,9 +6,9 @@
template: vsphere_vm_mem_usage
on: vsphere.vm_mem_usage_percentage
- class: Virtual Machine
+ class: Utilization
+ type: Virtual Machine
component: Memory
- type: Utilization
hosts: *
calc: $used
units: %
@@ -23,9 +23,9 @@ component: Memory
template: vsphere_host_mem_usage
on: vsphere.host_mem_usage_percentage
- class: Virtual Machine
+ class: Utilization
+ type: Virtual Machine
component: Memory
- type: Utilization
hosts: *
calc: $used
units: %
@@ -39,9 +39,9 @@ component: Memory
template: vsphere_inbound_packets_errors
on: vsphere.net_errors_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
@@ -51,9 +51,9 @@ component: Network
template: vsphere_outbound_packets_errors
on: vsphere.net_errors_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
@@ -65,9 +65,9 @@ component: Network
template: vsphere_inbound_packets_errors_ratio
on: vsphere.net_packets_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
@@ -81,9 +81,9 @@ component: Network
template: vsphere_outbound_packets_errors_ratio
on: vsphere.net_packets_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
@@ -100,9 +100,9 @@ component: Network
template: vsphere_cpu_usage
on: vsphere.cpu_usage_total
- class: Virtual Machine
+ class: Utilization
+ type: Virtual Machine
component: CPU
- type: Utilization
hosts: *
lookup: average -10m unaligned match-names of used
units: %
@@ -117,9 +117,9 @@ component: CPU
template: vsphere_inbound_packets_dropped
on: vsphere.net_drops_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
@@ -129,9 +129,9 @@ component: Network
template: vsphere_outbound_packets_dropped
on: vsphere.net_drops_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
@@ -143,9 +143,9 @@ component: Network
template: vsphere_inbound_packets_dropped_ratio
on: vsphere.net_packets_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
@@ -159,9 +159,9 @@ component: Network
template: vsphere_outbound_packets_dropped_ratio
on: vsphere.net_packets_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 127c9a9c6..454e0abef 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -1,22 +1,4 @@
-# make sure we can collect web log data
-
- template: last_collected_secs
- on: web_log.response_codes
- class: Web Server
-component: Web log
- type: Latency
- families: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
# -----------------------------------------------------------------------------
# high level response code alarms
@@ -29,9 +11,9 @@ component: Web log
template: 1m_requests
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
@@ -41,9 +23,9 @@ component: Web log
template: 1m_successful
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned of successful_requests
calc: $this * 100 / $1m_requests
@@ -57,41 +39,39 @@ component: Web log
template: 1m_redirects
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned of redirects
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
to: webmaster
template: 1m_bad_requests
on: web_log.response_statuses
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of bad_requests
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of client error HTTP requests over the last minute (4xx except 401)
to: webmaster
template: 1m_internal_errors
on: web_log.response_statuses
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of server_errors
calc: $this * 100 / $1m_requests
@@ -114,9 +94,9 @@ component: Web log
template: 1m_total_requests
on: web_log.response_codes
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
@@ -126,9 +106,9 @@ component: Web log
template: 1m_unmatched
on: web_log.response_codes
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of unmatched
calc: $this * 100 / $1m_total_requests
@@ -151,9 +131,9 @@ component: Web log
template: 10m_response_time
on: web_log.response_time
- class: System
+ class: Latency
+ type: System
component: Web log
- type: Latency
families: *
lookup: average -10m unaligned of avg
units: ms
@@ -162,9 +142,9 @@ component: Web log
template: web_slow
on: web_log.response_time
- class: Web Server
+ class: Latency
+ type: Web Server
component: Web log
- type: Latency
families: *
lookup: average -1m unaligned of avg
units: ms
@@ -191,9 +171,9 @@ component: Web log
template: 5m_successful_old
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: average -5m at -5m unaligned of successful_requests
units: requests/s
@@ -202,9 +182,9 @@ component: Web log
template: 5m_successful
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: average -5m unaligned of successful_requests
units: requests/s
@@ -213,9 +193,9 @@ component: Web log
template: 5m_requests_ratio
on: web_log.response_codes
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
units: %
@@ -233,23 +213,6 @@ component: Web log
# ---------------------------------------------------GO-VERSION---------------------------------------------------------
-# make sure we can collect web log data
-
- template: web_log_last_collected_secs
- on: web_log.requests
- class: Web Server
-component: Web log
- type: Latency
- families: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
# unmatched lines
# the following alarms trigger only when there are enough data.
@@ -261,9 +224,9 @@ component: Web log
template: web_log_1m_total_requests
on: web_log.requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
@@ -273,9 +236,9 @@ component: Web log
template: web_log_1m_unmatched
on: web_log.excluded_requests
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of unmatched
calc: $this * 100 / $web_log_1m_total_requests
@@ -298,9 +261,9 @@ component: Web log
template: web_log_1m_requests
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
@@ -310,9 +273,9 @@ component: Web log
template: web_log_1m_successful
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned of success
calc: $this * 100 / $web_log_1m_requests
@@ -326,41 +289,39 @@ component: Web log
template: web_log_1m_redirects
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned of redirect
calc: $this * 100 / $web_log_1m_requests
units: %
every: 10s
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
to: webmaster
template: web_log_1m_bad_requests
on: web_log.type_requests
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of bad
calc: $this * 100 / $web_log_1m_requests
units: %
every: 10s
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of client error HTTP requests over the last minute (4xx except 401)
to: webmaster
template: web_log_1m_internal_errors
on: web_log.type_requests
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of error
calc: $this * 100 / $web_log_1m_requests
@@ -384,9 +345,9 @@ component: Web log
template: web_log_10m_response_time
on: web_log.request_processing_time
- class: System
+ class: Latency
+ type: System
component: Web log
- type: Latency
families: *
lookup: average -10m unaligned of avg
units: ms
@@ -395,9 +356,9 @@ component: Web log
template: web_log_web_slow
on: web_log.request_processing_time
- class: Web Server
+ class: Latency
+ type: Web Server
component: Web log
- type: Latency
families: *
lookup: average -1m unaligned of avg
units: ms
@@ -424,9 +385,9 @@ component: Web log
template: web_log_5m_successful_old
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: average -5m at -5m unaligned of success
units: requests/s
@@ -435,9 +396,9 @@ component: Web log
template: web_log_5m_successful
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: average -5m unaligned of success
units: requests/s
@@ -446,9 +407,9 @@ component: Web log
template: web_log_5m_requests_ratio
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
units: %
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
index c6d3a9de0..be5eb58f9 100644
--- a/health/health.d/whoisquery.conf
+++ b/health/health.d/whoisquery.conf
@@ -1,26 +1,9 @@
-# make sure whoisquery is running
-
- template: whoisquery_last_collected_secs
- on: whoisquery.time_until_expiration
- class: Other
-component: WHOIS
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 60s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
template: whoisquery_days_until_expiration
on: whoisquery.time_until_expiration
- class: Other
+ class: Utilization
+ type: Other
component: WHOIS
- type: Utilization
calc: $expiry
units: seconds
every: 60s
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
index 6bd4e077f..90d39ce9d 100644
--- a/health/health.d/wmi.conf
+++ b/health/health.d/wmi.conf
@@ -1,29 +1,11 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-## Availability
-
- template: wmi_last_collected_secs
- on: cpu.collector_duration
- class: Windows
-component: Availability
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
## CPU
template: wmi_10min_cpu_usage
on: wmi.cpu_utilization_total
- class: Windows
+ class: Utilization
+ type: Windows
component: CPU
- type: Utilization
os: linux
hosts: *
lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
@@ -40,9 +22,9 @@ component: CPU
template: wmi_ram_in_use
on: wmi.memory_utilization
- class: Windows
+ class: Utilization
+ type: Windows
component: Memory
- type: Utilization
os: linux
hosts: *
calc: ($used) * 100 / ($used + $available)
@@ -56,9 +38,9 @@ component: Memory
template: wmi_swap_in_use
on: wmi.memory_swap_utilization
- class: Windows
+ class: Utilization
+ type: Windows
component: Memory
- type: Utilization
os: linux
hosts: *
calc: ($used) * 100 / ($used + $available)
@@ -75,9 +57,9 @@ component: Memory
template: wmi_inbound_packets_discarded
on: wmi.net_discarded
- class: Windows
+ class: Errors
+ type: Windows
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -91,9 +73,9 @@ component: Network
template: wmi_outbound_packets_discarded
on: wmi.net_discarded
- class: Windows
+ class: Errors
+ type: Windows
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -107,9 +89,9 @@ component: Network
template: wmi_inbound_packets_errors
on: wmi.net_errors
- class: Windows
+ class: Errors
+ type: Windows
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -123,9 +105,9 @@ component: Network
template: wmi_outbound_packets_errors
on: wmi.net_errors
- class: Windows
+ class: Errors
+ type: Windows
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -142,9 +124,9 @@ component: Network
template: wmi_disk_in_use
on: wmi.logical_disk_utilization
- class: Windows
+ class: Utilization
+ type: Windows
component: Disk
- type: Utilization
os: linux
hosts: *
calc: ($used) * 100 / ($used + $free)
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index 93c406b7a..fc69d0288 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -1,26 +1,9 @@
-# make sure x509check is running
-
- template: x509check_last_collected_secs
- on: x509check.time_until_expiration
- class: Certificates
-component: x509 certificates
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 60s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
template: x509check_days_until_expiration
on: x509check.time_until_expiration
- class: Certificates
+ class: Latency
+ type: Certificates
component: x509 certificates
- type: Latency
calc: $expiry
units: seconds
every: 60s
@@ -31,9 +14,9 @@ component: x509 certificates
template: x509check_revocation_status
on: x509check.revocation_status
- class: Certificates
+ class: Errors
+ type: Certificates
component: x509 certificates
- type: Errors
calc: $revoked
every: 60s
crit: $this != nan AND $this != 0
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
index d6f5fa2fe..785838d47 100644
--- a/health/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
@@ -1,9 +1,9 @@
alarm: zfs_memory_throttle
on: zfs.memory_ops
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
lookup: sum -10m unaligned absolute of throttled
units: events
every: 1m
@@ -16,9 +16,9 @@ component: File system
template: zfs_pool_state_warn
on: zfspool.state
- class: System
+ class: Errors
+ type: System
component: File system
- type: Errors
calc: $degraded
units: boolean
every: 10s
@@ -29,9 +29,9 @@ component: File system
template: zfs_pool_state_crit
on: zfspool.state
- class: System
+ class: Errors
+ type: System
component: File system
- type: Errors
calc: $faulted + $unavail
units: boolean
every: 10s
diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf
deleted file mode 100644
index 8c7d5a73d..000000000
--- a/health/health.d/zookeeper.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure zookeeper is running
-
- template: zookeeper_last_collected_secs
- on: zookeeper.requests
- class: KV Storage
-component: ZooKeeper
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-