diff options
Diffstat (limited to '')
84 files changed, 616 insertions, 1180 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf index b067e1840..1d823addd 100644 --- a/health/health.d/adaptec_raid.conf +++ b/health/health.d/adaptec_raid.conf @@ -3,9 +3,9 @@ template: adaptec_raid_ld_status on: adaptec_raid.ld_status - class: System + class: Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: bool every: 10s @@ -18,9 +18,9 @@ component: RAID template: adaptec_raid_pd_state on: adaptec_raid.pd_state - class: System + class: Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: bool every: 10s diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf deleted file mode 100644 index 4bac98fbb..000000000 --- a/health/health.d/am2320.conf +++ /dev/null @@ -1,15 +0,0 @@ -# make sure am2320 is sending stats - - template: am2320_last_collected_secs - on: am2320.temperature - class: Other -component: Sensors - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf index f27e39fc1..269ae544b 100644 --- a/health/health.d/anomalies.conf +++ b/health/health.d/anomalies.conf @@ -2,9 +2,9 @@ template: anomalies_anomaly_probabilities on: anomalies.probability - class: Netdata + class: Errors + type: Netdata component: ML - type: Errors lookup: average -2m foreach * every: 1m warn: $this > 50 @@ -14,9 +14,9 @@ component: ML template: anomalies_anomaly_flags on: anomalies.anomaly - class: Netdata + class: Errors + type: Netdata component: ML - type: Errors lookup: sum -2m foreach * every: 1m warn: $this > 10 diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf deleted file mode 100644 index c623fb880..000000000 --- a/health/health.d/apache.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure apache is running - - template: apache_last_collected_secs - on: apache.requests - class: Web Server -component: Apache - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 07b5c28c9..65f1a69ab 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -2,9 +2,9 @@ template: apcupsd_10min_ups_load on: apcupsd.load - class: Power Supply + class: Utilization + type: Power Supply component: UPS - type: Utilization os: * hosts: * lookup: average -10m unaligned of percentage @@ -20,9 +20,9 @@ component: UPS # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. template: apcupsd_ups_charge on: apcupsd.charge - class: Power Supply + class: Errors + type: Power Supply component: UPS - type: Errors os: * hosts: * lookup: average -60s unaligned of charge @@ -36,9 +36,9 @@ component: UPS template: apcupsd_last_collected_secs on: apcupsd.load - class: Power Supply + class: Latency + type: Power Supply component: UPS device - type: Latency calc: $now - $last_collected_t every: 10s units: seconds ago diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf index 948ea551a..91d469395 100644 --- a/health/health.d/backend.conf +++ b/health/health.d/backend.conf @@ -1,9 +1,9 @@ # Alert that backends subsystem will be disabled soon alarm: backend_metrics_eol on: netdata.backend_metrics - class: Netdata + class: Errors + type: Netdata component: Exporting engine - type: Errors units: boolean calc: $now - $last_collected_t every: 1m @@ -16,9 +16,9 @@ component: Exporting engine alarm: backend_last_buffering on: netdata.backend_metrics - class: Netdata + class: Latency + type: Netdata component: Exporting engine - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -30,9 +30,9 @@ component: Exporting engine alarm: backend_metrics_sent on: netdata.backend_metrics - class: Netdata + class: Workload + type: Netdata 
component: Exporting engine - type: Workload units: % calc: abs($sent) * 100 / abs($buffered) every: 10s diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index d75d8e19b..49cb5ad0f 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -1,9 +1,9 @@ template: bcache_cache_errors on: disk.bcache_cache_read_races - class: System + class: Errors + type: System component: Disk - type: Errors lookup: sum -1m unaligned absolute units: errors every: 1m @@ -16,9 +16,9 @@ component: Disk template: bcache_cache_dirty on: disk.bcache_cache_alloc - class: System + class: Utilization + type: System component: Disk - type: Utilization calc: $dirty + $metadata + $undefined units: % every: 1m diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 99c754571..13ac8c182 100644 --- a/health/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf @@ -2,9 +2,9 @@ template: beanstalk_server_buried_jobs on: beanstalk.current_jobs - class: Messaging + class: Workload + type: Messaging component: Beanstalk - type: Workload calc: $buried units: jobs every: 10s diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf index e88f87a4f..7c09225ff 100644 --- a/health/health.d/bind_rndc.conf +++ b/health/health.d/bind_rndc.conf @@ -1,8 +1,8 @@ template: bind_rndc_stats_file_size on: bind_rndc.stats_size - class: DNS + class: Utilization + type: DNS component: BIND - type: Utilization units: megabytes every: 60 calc: $stats_size diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf index 8604abee9..7d7a4fdae 100644 --- a/health/health.d/boinc.conf +++ b/health/health.d/boinc.conf @@ -3,9 +3,9 @@ # Warn on any compute errors encountered. 
template: boinc_compute_errors on: boinc.states - class: Computing + class: Errors + type: Computing component: BOINC - type: Errors os: * hosts: * families: * @@ -21,9 +21,9 @@ component: BOINC # Warn on lots of upload errors template: boinc_upload_errors on: boinc.states - class: Computing + class: Errors + type: Computing component: BOINC - type: Errors os: * hosts: * families: * @@ -39,9 +39,9 @@ component: BOINC # Warn on the task queue being empty template: boinc_total_tasks on: boinc.tasks - class: Computing + class: Utilization + type: Computing component: BOINC - type: Utilization os: * hosts: * families: * @@ -57,9 +57,9 @@ component: BOINC # Warn on no active tasks with a non-empty queue template: boinc_active_tasks on: boinc.tasks - class: Computing + class: Utilization + type: Computing component: BOINC - type: Utilization os: * hosts: * families: * diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf index d3200a7ee..8d197aa8d 100644 --- a/health/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf @@ -1,9 +1,9 @@ template: btrfs_allocated on: btrfs.disk - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -18,9 +18,9 @@ component: File system template: btrfs_data on: btrfs.data - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -35,9 +35,9 @@ component: File system template: btrfs_metadata on: btrfs.metadata - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -52,9 +52,9 @@ component: File system template: btrfs_system on: btrfs.system - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf index ed8f9b4b9..1f9da25c7 100644 --- a/health/health.d/ceph.conf +++ b/health/health.d/ceph.conf @@ 
-2,9 +2,9 @@ template: ceph_cluster_space_usage on: ceph.general_usage - class: Storage + class: Utilization + type: Storage component: Ceph - type: Utilization calc: $used * 100 / ($used + $avail) units: % every: 1m diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 068533f10..45b34806c 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -3,9 +3,9 @@ template: cgroup_10min_cpu_usage on: cgroup.cpu_limit - class: Cgroups + class: Utilization + type: Cgroups component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned @@ -19,9 +19,9 @@ component: CPU template: cgroup_ram_in_use on: cgroup.mem_usage - class: Cgroups + class: Utilization + type: Cgroups component: Memory - type: Utilization os: linux hosts: * calc: ($ram) * 100 / $memory_limit diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf index dccd2b064..1f227841e 100644 --- a/health/health.d/cockroachdb.conf +++ b/health/health.d/cockroachdb.conf @@ -1,27 +1,11 @@ -# Availability - - template: cockroachdb_last_collected_secs - on: cockroachdb.live_nodes - class: Database -component: CockroachDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - # Capacity template: cockroachdb_used_storage_capacity on: cockroachdb.storage_used_capacity_percentage - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: $capacity_used_percent units: % every: 10s @@ -33,9 +17,9 @@ component: CockroachDB template: cockroachdb_used_usable_storage_capacity on: cockroachdb.storage_used_capacity_percentage - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: $capacity_usable_used_percent units: % every: 10s @@ -49,37 +33,37 @@ component: CockroachDB template: cockroachdb_unavailable_ranges on: cockroachdb.ranges_replication_problem - class: Database + class: Errors + type: Database component: CockroachDB - type: Utilization calc: $ranges_unavailable units: num every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: number of ranges with fewer live replicas than the replication target + info: number of ranges with fewer live replicas than needed for quorum to: dba - template: cockroachdb_replicas_leaders_not_leaseholders - on: cockroachdb.replicas_leaders - class: Database + template: cockroachdb_underreplicated_ranges + on: cockroachdb.ranges_replication_problem + class: Errors + type: Database component: CockroachDB - type: Utilization - calc: $replicas_leaders_not_leaseholders + calc: $ranges_underreplicated units: num every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: number of replicas that are Raft leaders whose range lease is held by another store + info: number of ranges with fewer live replicas than the replication target to: dba # FD template: cockroachdb_open_file_descriptors_limit on: cockroachdb.process_file_descriptors - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: 
$sys_fd_open/$sys_fd_softlimit * 100 units: % every: 10s @@ -87,29 +71,3 @@ component: CockroachDB delay: down 15m multiplier 1.5 max 1h info: open file descriptors utilization (against softlimit) to: dba - -# SQL - - template: cockroachdb_sql_active_connections - on: cockroachdb.sql_connections - class: Database -component: CockroachDB - type: Utilization - calc: $sql_conns - units: active connections - every: 10s - info: number of active SQL connections - to: dba - - template: cockroachdb_sql_executed_statements_total_last_5m - on: cockroachdb.sql_statements_total - class: Database -component: CockroachDB - type: Workload - lookup: sum -5m absolute of sql_query_count - units: statements - every: 10s - warn: $this == 0 AND $cockroachdb_sql_active_connections != 0 - delay: down 15m up 30s multiplier 1.5 max 1h - info: number of executed SQL statements in the last 5 minutes - to: dba diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf deleted file mode 100644 index c86c6b988..000000000 --- a/health/health.d/couchdb.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure couchdb is running - - template: couchdb_last_collected_secs - on: couchdb.request_methods - class: Database -component: CouchDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf index d11215768..ad6952825 100644 --- a/health/health.d/cpu.conf +++ b/health/health.d/cpu.conf @@ -3,9 +3,9 @@ template: 10min_cpu_usage on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned of user,system,softirq,irq,guest @@ -19,9 +19,9 @@ component: CPU template: 10min_cpu_iowait on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned of iowait @@ -35,9 +35,9 @@ component: CPU template: 20min_steal_cpu on: system.cpu - class: System + class: Latency + type: System component: CPU - type: Latency os: linux hosts: * lookup: average -20m unaligned of steal @@ -52,9 +52,9 @@ component: CPU ## FreeBSD template: 10min_cpu_usage on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: freebsd hosts: * lookup: average -10m unaligned of user,system,interrupt diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index 79c156ab8..65c41b846 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -3,9 +3,9 @@ alarm: 10min_dbengine_global_fs_errors on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of fs_errors @@ -18,9 +18,9 @@ component: DB engine alarm: 10min_dbengine_global_io_errors on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of io_errors @@ -33,9 +33,9 @@ component: DB engine alarm: 
10min_dbengine_global_flushing_warnings on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of pg_cache_over_half_dirty_events @@ -49,9 +49,9 @@ component: DB engine alarm: 10min_dbengine_global_flushing_errors on: netdata.dbengine_long_term_page_stats - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of flushing_pressure_deletions diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 60f8faed9..5daff61a1 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -11,9 +11,9 @@ template: disk_space_usage on: disk.space - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: !/dev !/dev/* !/run !/run/* * @@ -28,9 +28,9 @@ component: Disk template: disk_inode_usage on: disk.inodes - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: !/dev !/dev/* !/run !/run/* * @@ -136,19 +136,16 @@ component: Disk template: 10min_disk_utilization on: disk.util - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: * lookup: average -10m unaligned units: % every: 1m - green: 90 - red: 98 - warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) - crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + warn: $this > 98 * (($status >= $WARNING) ? 
(0.7) : (1)) delay: down 15m multiplier 1.2 max 1h info: average percentage of time $family disk was busy over the last 10 minutes to: silent @@ -161,19 +158,16 @@ component: Disk template: 10min_disk_backlog on: disk.backlog - class: System + class: Latency + type: System component: Disk - type: Latency os: linux hosts: * families: * lookup: average -10m unaligned units: ms every: 1m - green: 2000 - red: 5000 - warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) - crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h info: average backlog size of the $family disk over the last 10 minutes to: silent diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf index 1fbb2c598..ec4937c0a 100644 --- a/health/health.d/dns_query.conf +++ b/health/health.d/dns_query.conf @@ -3,9 +3,9 @@ template: dns_query_time_query_time on: dns_query_time.query_time - class: DNS + class: Latency + type: DNS component: DNS - type: Latency lookup: average -10s unaligned foreach * units: ms every: 10s diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index 10d139f77..010b94599 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -2,9 +2,9 @@ template: dnsmasq_dhcp_dhcp_range_utilization on: dnsmasq_dhcp.dhcp_range_utilization - class: DHCP + class: Utilization + type: DHCP component: Dnsmasq - type: Utilization every: 10s units: % calc: $used diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf index ba866f81b..220ddd664 100644 --- a/health/health.d/dockerd.conf +++ b/health/health.d/dockerd.conf @@ -1,8 +1,8 @@ template: docker_unhealthy_containers on: docker.unhealthy_containers - class: Containers + class: Errors + type: Containers component: Docker - type: Errors units: unhealthy containers every: 10s lookup: average -10s diff --git a/health/health.d/elasticsearch.conf 
b/health/health.d/elasticsearch.conf deleted file mode 100644 index 05d576c39..000000000 --- a/health/health.d/elasticsearch.conf +++ /dev/null @@ -1,15 +0,0 @@ - -# make sure elasticsearch is running - - template: elasticsearch_last_collected - on: elasticsearch.cluster_health_status - class: Search engine -component: Elasticsearch - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf index 0478fa0be..13b0fcde4 100644 --- a/health/health.d/entropy.conf +++ b/health/health.d/entropy.conf @@ -5,9 +5,9 @@ alarm: lowest_entropy on: system.entropy - class: System + class: Utilization + type: System component: Cryptography - type: Utilization os: linux hosts: * lookup: min -5m unaligned diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf index 4430f3fd8..06f398c6e 100644 --- a/health/health.d/exporting.conf +++ b/health/health.d/exporting.conf @@ -1,22 +1,25 @@ -template: exporting_last_buffering -families: * - on: exporting_data_size - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful buffering of exporting data - to: dba + template: exporting_last_buffering + families: * + on: exporting_data_size + class: Latency + type: Netdata +component: Exporting engine + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? 
($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful buffering of exporting data + to: dba template: exporting_metrics_sent families: * on: exporting_data_size - class: Netdata + class: Workload + type: Netdata component: Exporting engine - type: Workload units: % calc: abs($sent) * 100 / abs($buffered) every: 10s diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf index 120fe8f28..bb22419fa 100644 --- a/health/health.d/fping.conf +++ b/health/health.d/fping.conf @@ -2,9 +2,9 @@ template: fping_last_collected_secs families: * on: fping.latency - class: Other + class: Latency + type: Other component: Network - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -17,9 +17,9 @@ component: Network template: fping_host_reachable families: * on: fping.latency - class: Other + class: Errors + type: Other component: Network - type: Errors calc: $average != nan units: up/down every: 10s @@ -31,9 +31,9 @@ component: Network template: fping_host_latency families: * on: fping.latency - class: Other + class: Latency + type: Other component: Network - type: Latency lookup: average -10s unaligned of average units: ms every: 10s @@ -48,9 +48,9 @@ component: Network template: fping_packet_loss families: * on: fping.quality - class: System + class: Errors + type: System component: Network - type: Errors lookup: average -10m unaligned of returned calc: 100 - $this green: 1 diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf index 81aafaa60..853bd7fbc 100644 --- a/health/health.d/fronius.conf +++ b/health/health.d/fronius.conf @@ -1,9 +1,9 @@ template: fronius_last_collected_secs families: * on: fronius.power - class: Power Supply + class: Latency + type: Power Supply component: Solar - type: Latency calc: $now - $last_collected_t every: 10s units: seconds ago diff 
--git a/health/health.d/gearman.conf b/health/health.d/gearman.conf index e2031bf2b..14010d445 100644 --- a/health/health.d/gearman.conf +++ b/health/health.d/gearman.conf @@ -1,24 +1,10 @@ -# make sure Gearman is running - template: gearman_last_collected_secs - on: gearman.total_jobs - class: Computing -component: Gearman - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin template: gearman_workers_queued on: gearman.single_job - class: Computing + class: Latency + type: Computing component: Gearman - type: Latency - lookup: average -10m unaligned match-names of Queued + lookup: average -10m unaligned match-names of Pending units: workers every: 10s warn: $this > 30000 diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf new file mode 100644 index 000000000..dd1eb4701 --- /dev/null +++ b/health/health.d/geth.conf @@ -0,0 +1,12 @@ +#chainhead_header is expected momentarily to be ahead. If it's considerably ahead (e.g. more than 5 blocks), then the node is definitely out of sync. 
+ template: geth_chainhead_diff_between_header_block + on: geth.chainhead + class: Workload + type: ethereum_node +component: geth + every: 10s + calc: $chain_head_block - $chain_head_header + units: blocks + warn: $this != 0 + crit: $this > 5 + delay: down 1m multiplier 1.5 max 1h diff --git a/health/health.d/lighttpd.conf b/health/health.d/go.d.plugin.conf index 0f067549e..8bf84a976 100644 --- a/health/health.d/lighttpd.conf +++ b/health/health.d/go.d.plugin.conf @@ -1,11 +1,12 @@ -# make sure lighttpd is running +# make sure go.d.plugin data collection job is running - template: lighttpd_last_collected_secs - on: lighttpd.requests - class: Web Server -component: Lighttpd - type: Latency + template: go.d_job_last_collected_secs + on: netdata.go_plugin_execution_time + class: Error + type: Netdata +component: go.d.plugin + module: * calc: $now - $last_collected_t units: seconds ago every: 10s @@ -14,4 +15,3 @@ component: Lighttpd delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: webmaster - diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf index 9f6b1c577..a0ab52bca 100644 --- a/health/health.d/haproxy.conf +++ b/health/health.d/haproxy.conf @@ -1,8 +1,8 @@ template: haproxy_backend_server_status on: haproxy_hs.down - class: Web Proxy + class: Errors + type: Web Proxy component: HAProxy - type: Errors units: failed servers every: 10s lookup: average -10s @@ -12,25 +12,12 @@ component: HAProxy template: haproxy_backend_status on: haproxy_hb.down - class: Web Proxy + class: Errors + type: Web Proxy component: HAProxy - type: Errors units: failed backend every: 10s lookup: average -10s crit: $this > 0 info: average number of failed haproxy backends over the last 10 seconds to: sysadmin - - template: haproxy_last_collected - on: haproxy_hb.down - class: Web Proxy -component: HAProxy - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status 
>= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf index bd8308bed..ca8df31b9 100644 --- a/health/health.d/hdfs.conf +++ b/health/health.d/hdfs.conf @@ -1,28 +1,11 @@ -# make sure hdfs is running - - template: hdfs_last_collected_secs - on: hdfs.heap_memory - class: Storage -component: HDFS - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - # Common template: hdfs_capacity_usage on: hdfs.capacity - class: Storage + class: Utilization + type: Storage component: HDFS - type: Utilization calc: ($used) * 100 / ($used + $remaining) units: % every: 10s @@ -37,9 +20,9 @@ component: HDFS template: hdfs_missing_blocks on: hdfs.blocks - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $missing units: missing blocks every: 10s @@ -51,9 +34,9 @@ component: HDFS template: hdfs_stale_nodes on: hdfs.data_nodes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $stale units: dead nodes every: 10s @@ -65,9 +48,9 @@ component: HDFS template: hdfs_dead_nodes on: hdfs.data_nodes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $dead units: dead nodes every: 10s @@ -81,9 +64,9 @@ component: HDFS template: hdfs_num_failed_volumes on: hdfs.num_failed_volumes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $fsds_num_failed_volumes units: failed volumes every: 10s diff --git 
a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index d4d6376a3..599c47acc 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -1,25 +1,11 @@ - template: httpcheck_last_collected_secs - families: * - on: httpcheck.status - class: Other -component: HTTP endpoint - type: Latency - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges template: httpcheck_web_service_up families: * on: httpcheck.status - class: Web Server + class: Utilization + type: Web Server component: HTTP endpoint - type: Utilization lookup: average -1m unaligned percentage of success calc: ($this < 75) ? 
(0) : ($this) every: 5s @@ -30,9 +16,9 @@ component: HTTP endpoint template: httpcheck_web_service_bad_content families: * on: httpcheck.status - class: Web Server + class: Workload + type: Web Server component: HTTP endpoint - type: Workload lookup: average -5m unaligned percentage of bad_content every: 10s units: % @@ -46,9 +32,9 @@ component: HTTP endpoint template: httpcheck_web_service_bad_status families: * on: httpcheck.status - class: Web Server + class: Workload + type: Web Server component: HTTP endpoint - type: Workload lookup: average -5m unaligned percentage of bad_status every: 10s units: % @@ -62,9 +48,9 @@ component: HTTP endpoint template: httpcheck_web_service_timeouts families: * on: httpcheck.status - class: Web Server + class: Latency + type: Web Server component: HTTP endpoint - type: Latency lookup: average -5m unaligned percentage of timeout every: 10s units: % @@ -73,9 +59,9 @@ component: HTTP endpoint template: httpcheck_no_web_service_connections families: * on: httpcheck.status - class: Other + class: Errors + type: Other component: HTTP endpoint - type: Errors lookup: average -5m unaligned percentage of no_connection every: 10s units: % @@ -85,9 +71,9 @@ component: HTTP endpoint template: httpcheck_web_service_unreachable families: * on: httpcheck.status - class: Web Server + class: Errors + type: Web Server component: HTTP endpoint - type: Errors calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? 
($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) units: % every: 10s @@ -101,9 +87,9 @@ component: HTTP endpoint template: httpcheck_1h_web_service_response_time families: * on: httpcheck.responsetime - class: Other + class: Latency + type: Other component: HTTP endpoint - type: Latency lookup: average -1h unaligned of time every: 30s units: ms @@ -112,9 +98,9 @@ component: HTTP endpoint template: httpcheck_web_service_slow families: * on: httpcheck.responsetime - class: Web Server + class: Latency + type: Web Server component: HTTP endpoint - type: Latency lookup: average -3m unaligned of time units: ms every: 10s diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 57ce4e866..ee4befbea 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -1,9 +1,9 @@ template: ioping_disk_latency families: * on: ioping.latency - class: System + class: Latency + type: System component: Disk - type: Latency lookup: average -10s unaligned of average units: ms every: 10s diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf index 6eaf7abe9..c178a410a 100644 --- a/health/health.d/ipc.conf +++ b/health/health.d/ipc.conf @@ -3,9 +3,9 @@ alarm: semaphores_used on: system.ipc_semaphores - class: System + class: Utilization + type: System component: IPC - type: Utilization os: linux hosts: * calc: $semaphores * 100 / $ipc_semaphores_max @@ -19,9 +19,9 @@ component: IPC alarm: semaphore_arrays_used on: system.ipc_semaphore_arrays - class: System + class: Utilization + type: System component: IPC - type: Utilization os: linux hosts: * calc: $arrays * 100 / $ipc_semaphores_arrays_max diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf index 6268f4092..a514ddfd0 100644 --- a/health/health.d/ipfs.conf +++ b/health/health.d/ipfs.conf @@ -1,9 +1,9 @@ template: ipfs_datastore_usage on: ipfs.repo_size - class: Data Sharing + class: Utilization + type: Data Sharing component: IPFS - type: 
Utilization calc: $size * 100 / $avail units: % every: 10s diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf index d4fdc6c79..feadba1b7 100644 --- a/health/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf @@ -1,8 +1,8 @@ alarm: ipmi_sensors_states on: ipmi.sensors_states - class: System + class: Errors + type: System component: IPMI - type: Errors calc: $warning + $critical units: sensors every: 10s @@ -14,9 +14,9 @@ component: IPMI alarm: ipmi_events on: ipmi.events - class: System + class: Utilization + type: System component: IPMI - type: Utilization calc: $events units: events every: 10s diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf index 4d3c45f97..c2778cc5e 100644 --- a/health/health.d/kubelet.conf +++ b/health/health.d/kubelet.conf @@ -6,9 +6,9 @@ template: kubelet_node_config_error on: k8s_kubelet.kubelet_node_config_error - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors calc: $kubelet_node_config_error units: bool every: 10s @@ -22,9 +22,9 @@ component: Kubelet template: kubelet_token_requests lookup: sum -10s of token_fail_count on: k8s_kubelet.kubelet_token_requests - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors units: failed requests every: 10s warn: $this > 0 @@ -37,9 +37,9 @@ component: Kubelet template: kubelet_operations_error lookup: sum -1m on: k8s_kubelet.kubelet_operations_errors - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors units: errors every: 10s warn: $this > (($status >= $WARNING) ? 
(0) : (20)) @@ -64,9 +64,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 units: microseconds every: 10s @@ -74,9 +74,9 @@ component: Kubelet template: kubelet_10s_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) every: 10s @@ -92,9 +92,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 units: microseconds every: 10s @@ -102,9 +102,9 @@ component: Kubelet template: kubelet_10s_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) every: 10s @@ -120,9 +120,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 units: microseconds every: 10s @@ -130,9 +130,9 @@ component: Kubelet template: kubelet_10s_pleg_relist_latency_quantile_099 
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) every: 10s diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf index e28c246a3..c0bc6de8a 100644 --- a/health/health.d/linux_power_supply.conf +++ b/health/health.d/linux_power_supply.conf @@ -2,9 +2,9 @@ template: linux_power_supply_capacity on: powersupply.capacity - class: Power Supply + class: Utilization + type: Power Supply component: Battery - type: Utilization calc: $capacity units: % every: 10s diff --git a/health/health.d/load.conf b/health/health.d/load.conf index e811f6ee2..0bd872f85 100644 --- a/health/health.d/load.conf +++ b/health/health.d/load.conf @@ -6,9 +6,9 @@ # minute, with a special case for a single CPU of setting the trigger at 2. alarm: load_cpu_number on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? 
( 2 ) : ( $active_processors ) @@ -22,9 +22,9 @@ component: Load alarm: load_average_15 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load15 @@ -37,9 +37,9 @@ component: Load alarm: load_average_5 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load5 @@ -52,9 +52,9 @@ component: Load alarm: load_average_1 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load1 diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index 67483b201..cedaa000e 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -1,8 +1,8 @@ template: mdstat_last_collected on: md.disks - class: System + class: Latency + type: System component: RAID - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -13,9 +13,9 @@ component: RAID template: mdstat_disks on: md.disks - class: System + class: Errors + type: System component: RAID - type: Errors units: failed devices every: 10s calc: $down @@ -26,9 +26,9 @@ component: RAID template: mdstat_mismatch_cnt on: md.mismatch_cnt - class: System + class: Errors + type: System component: RAID - type: Errors families: !*(raid1) !*(raid10) * units: unsynchronized blocks calc: $count @@ -40,9 +40,9 @@ component: RAID template: mdstat_nonredundant_last_collected on: md.nonredundant - class: System + class: Latency + type: System component: RAID - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf index 1b6502f62..9fbcfdb92 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -3,9 +3,9 @@ template: megacli_adapter_state on: megacli.adapter_degraded - class: System + class: 
Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: boolean every: 10s @@ -18,9 +18,9 @@ component: RAID template: megacli_pd_predictive_failures on: megacli.pd_predictive_failure - class: System + class: Errors + type: System component: RAID - type: Errors lookup: sum -10s foreach * units: predictive failures every: 10s @@ -31,9 +31,9 @@ component: RAID template: megacli_pd_media_errors on: megacli.pd_media_error - class: System + class: Errors + type: System component: RAID - type: Errors lookup: sum -10s foreach * units: media errors every: 10s @@ -46,9 +46,9 @@ component: RAID template: megacli_bbu_relative_charge on: megacli.bbu_relative_charge - class: System + class: Workload + type: System component: RAID - type: Workload lookup: average -10s units: percent every: 10s @@ -59,9 +59,9 @@ component: RAID template: megacli_bbu_cycle_count on: megacli.bbu_cycle_count - class: System + class: Workload + type: System component: RAID - type: Workload lookup: average -10s units: cycles every: 10s diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf index f4b734c38..2a2fe4b82 100644 --- a/health/health.d/memcached.conf +++ b/health/health.d/memcached.conf @@ -1,28 +1,11 @@ -# make sure memcached is running - - template: memcached_last_collected_secs - on: memcached.cache - class: KV Storage -component: Memcached - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - - # detect if memcached cache is full template: memcached_cache_memory_usage on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization calc: $used * 100 / ($used + $available) units: % every: 10s @@ -37,9 +20,9 @@ component: Memcached template: memcached_cache_fill_rate on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization lookup: min -10m at -50m unaligned of available calc: ($this - $available) / (($now - $after) / 3600) units: KB/hour @@ -51,9 +34,9 @@ component: Memcached template: memcached_out_of_cache_space_time on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf) units: hours every: 10s diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf index ab651315f..010cbbd7b 100644 --- a/health/health.d/memory.conf +++ b/health/health.d/memory.conf @@ -3,9 +3,9 @@ alarm: 1hour_ecc_memory_correctable on: mem.ecc_ce - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * lookup: sum -10m unaligned @@ -18,9 +18,9 @@ component: Memory alarm: 1hour_ecc_memory_uncorrectable on: mem.ecc_ue - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * lookup: sum -10m unaligned @@ -33,9 +33,9 @@ component: Memory alarm: 1hour_memory_hw_corrupted on: mem.hwcorrupt - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * calc: $HardwareCorrupted diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf deleted file mode 100644 index 8c9bdeb6f..000000000 --- 
a/health/health.d/mongodb.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure mongodb is running - - template: mongodb_last_collected_secs - on: mongodb.read_operations - class: Database -component: MongoDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 91860c4a7..34452d983 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -1,29 +1,11 @@ -# make sure mysql is running - - template: mysql_last_collected_secs - on: mysql.queries - class: Database -component: MySQL - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - - -# ----------------------------------------------------------------------------- # slow queries template: mysql_10s_slow_queries on: mysql.queries - class: Database + class: Latency + type: Database component: MySQL - type: Latency lookup: sum -10s of slow_queries units: slow queries every: 10s @@ -39,9 +21,9 @@ component: MySQL template: mysql_10s_table_locks_immediate on: mysql.table_locks - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization lookup: sum -10s absolute of immediate units: immediate locks every: 10s @@ -50,9 +32,9 @@ component: MySQL template: mysql_10s_table_locks_waited on: mysql.table_locks - class: Database + class: Latency + type: Database component: MySQL - type: Latency lookup: sum -10s absolute of waited units: waited locks every: 10s @@ -61,9 +43,9 @@ component: MySQL template: mysql_10s_waited_locks_ratio on: mysql.table_locks - class: Database + class: Latency + type: Database component: MySQL - type: Latency calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? 
(($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0 units: % every: 10s @@ -79,9 +61,9 @@ component: MySQL template: mysql_connections on: mysql.connections_active - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization calc: $active * 100 / $limit units: % every: 10s @@ -97,9 +79,9 @@ component: MySQL template: mysql_replication on: mysql.slave_status - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: ($sql_running <= 0 OR $io_running <= 0)?0:1 units: ok/failed every: 10s @@ -110,9 +92,9 @@ component: MySQL template: mysql_replication_lag on: mysql.slave_behind - class: Database + class: Latency + type: Database component: MySQL - type: Errors calc: $seconds units: seconds every: 10s @@ -129,9 +111,9 @@ component: MySQL template: mysql_galera_cluster_size_max_2m on: mysql.galera_cluster_size - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization lookup: max -2m absolute units: nodes every: 10s @@ -140,9 +122,9 @@ component: MySQL template: mysql_galera_cluster_size on: mysql.galera_cluster_size - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization calc: $nodes units: nodes every: 10s @@ -156,9 +138,9 @@ component: MySQL template: mysql_galera_cluster_state on: mysql.galera_cluster_state - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: $state every: 10s warn: $this == 2 OR $this == 3 @@ -173,9 +155,9 @@ component: MySQL template: mysql_galera_cluster_status on: mysql.galera_cluster_status - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: $wsrep_cluster_status every: 10s crit: $mysql_galera_cluster_state != nan AND $this != 0 diff --git a/health/health.d/named.conf b/health/health.d/named.conf deleted file mode 100644 index 90266df16..000000000 --- a/health/health.d/named.conf 
+++ /dev/null @@ -1,17 +0,0 @@ - -# make sure named is running - - template: named_last_collected_secs - on: named.global_queries - class: DNS -component: BIND - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: domainadmin - diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 04219e163..028ca7b81 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -6,9 +6,9 @@ template: interface_speed on: net.net - class: System + class: Latency + type: System component: Network - type: Latency os: * hosts: * families: * @@ -19,9 +19,9 @@ component: Network template: 1m_received_traffic_overflow on: net.net - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * families: * @@ -36,9 +36,9 @@ component: Network template: 1m_sent_traffic_overflow on: net.net - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * families: * @@ -63,9 +63,9 @@ component: Network template: inbound_packets_dropped on: net.drops - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* * @@ -76,9 +76,9 @@ component: Network template: outbound_packets_dropped on: net.drops - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* * @@ -89,14 +89,14 @@ component: Network template: inbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* !wl* * lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 1000) ? 
($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m warn: $this >= 2 @@ -106,9 +106,9 @@ component: Network template: outbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* !wl* * @@ -123,14 +123,14 @@ component: Network template: wifi_inbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: wl* lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m warn: $this >= 10 @@ -140,9 +140,9 @@ component: Network template: wifi_outbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: wl* @@ -160,9 +160,9 @@ component: Network template: interface_inbound_errors on: net.errors - class: System + class: Errors + type: System component: Network - type: Errors os: freebsd hosts: * families: * @@ -176,9 +176,9 @@ component: Network template: interface_outbound_errors on: net.errors - class: System + class: Errors + type: System component: Network - type: Errors os: freebsd hosts: * families: * @@ -200,9 +200,9 @@ component: Network template: 10min_fifo_errors on: net.fifo - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: * @@ -225,9 +225,9 @@ component: Network template: 1m_received_packets_rate on: net.packets - class: System + class: Workload + type: System component: Network - type: Workload os: linux freebsd hosts: * families: * @@ -238,9 +238,9 @@ component: 
Network template: 10s_received_packets_storm on: net.packets - class: System + class: Workload + type: System component: Network - type: Workload os: linux freebsd hosts: * families: * diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf index 35c89caf7..7de383fa2 100644 --- a/health/health.d/netfilter.conf +++ b/health/health.d/netfilter.conf @@ -3,9 +3,9 @@ alarm: netfilter_conntrack_full on: netfilter.conntrack_sockets - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: max -10s unaligned of connections diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf deleted file mode 100644 index 30c738f47..000000000 --- a/health/health.d/nginx.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure nginx is running - - template: nginx_last_collected_secs - on: nginx.requests - class: Web Server -component: NGINX - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/phpfpm.conf b/health/health.d/phpfpm.conf deleted file mode 100644 index fc073a944..000000000 --- a/health/health.d/phpfpm.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure phpfpm is running - - template: phpfpm_last_collected_secs - on: phpfpm.requests - class: Web Server -component: PHP-FPM - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index 72622caed..2e5c1cbfd 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -1,45 +1,12 @@ -# Make sure Pi-hole is responding. - - template: pihole_last_collected_secs - on: pihole.dns_queries_total - class: Ad Filtering -component: Pi-hole - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - -# Blocked DNS queries. - - template: pihole_blocked_queries - on: pihole.dns_queries_percentage - class: Ad Filtering -component: Pi-hole - type: Errors - every: 10s - units: % - calc: $blocked - warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) - crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) ) - delay: up 2m down 5m - info: percentage of blocked dns queries over the last 24 hour - to: sysadmin - - # Blocklist last update time. # Default update interval is a week. 
template: pihole_blocklist_last_update on: pihole.blocklist_last_update - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: seconds calc: $ago @@ -52,15 +19,15 @@ component: Pi-hole template: pihole_blocklist_gravity_file on: pihole.blocklist_last_update - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: boolean calc: $file_exists crit: $this != 1 delay: up 2m down 5m - info: gravity.list (blocklist) file existence state (0: exists, 1: not-exists) + info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists) to: sysadmin # Pi-hole's ability to block unwanted domains. @@ -68,13 +35,13 @@ component: Pi-hole template: pihole_status on: pihole.unwanted_domains_blocking_status - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: boolean calc: $enabled warn: $this != 1 delay: up 2m down 5m - info: unwanted domains blocking status (0: enabled, 1: disabled) + info: unwanted domains blocking status (0: disabled, 1: enabled) to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index b977dbb31..8cbd7729c 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -1,25 +1,11 @@ - template: portcheck_last_collected_secs - families: * - on: portcheck.status - class: Other -component: TCP endpoint - type: Latency - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges template: portcheck_service_reachable families: * on: portcheck.status - class: Other + class: Workload + type: Other component: TCP endpoint - type: Workload lookup: average -1m unaligned percentage of success calc: ($this < 75) ? (0) : ($this) every: 5s @@ -30,9 +16,9 @@ component: TCP endpoint template: portcheck_connection_timeouts families: * on: portcheck.status - class: Other + class: Errors + type: Other component: TCP endpoint - type: Errors lookup: average -5m unaligned percentage of timeout every: 10s units: % @@ -45,9 +31,9 @@ component: TCP endpoint template: portcheck_connection_fails families: * on: portcheck.status - class: Other + class: Errors + type: Other component: TCP endpoint - type: Errors lookup: average -5m unaligned percentage of no_connection,failed every: 10s units: % diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf deleted file mode 100644 index f908a802a..000000000 --- a/health/health.d/postgres.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure postgres is running - - template: postgres_last_collected_secs - on: postgres.db_stat_transactions - class: Database -component: PostgreSQL - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf index b44a24c0b..2929ee3d4 100644 --- a/health/health.d/processes.conf +++ b/health/health.d/processes.conf @@ -2,9 +2,9 @@ alarm: active_processes on: system.active_processes - class: System + class: Workload + type: System component: Processes - type: Workload hosts: * calc: $active * 100 / $pidmax units: % diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf deleted file mode 100644 index 9903d4e38..000000000 --- a/health/health.d/pulsar.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# Availability - - template: pulsar_last_collected_secs - on: pulsar.broker_components - class: Messaging -component: Pulsar - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/nginx_plus.conf b/health/health.d/python.d.plugin.conf index 5849a9e7e..f3abc588f 100644 --- a/health/health.d/nginx_plus.conf +++ b/health/health.d/python.d.plugin.conf @@ -1,11 +1,12 @@ -# make sure nginx_plus is running +# make sure python.d.plugin data collection job is running - template: nginx_plus_last_collected_secs - on: nginx_plus.requests_total - class: Web Server -component: NGINX Plus - type: Latency + template: python.d_job_last_collected_secs + on: netdata.pythond_runtime + class: Error + type: Netdata +component: python.d.plugin + module: * calc: $now - $last_collected_t units: seconds ago every: 10s @@ -14,4 +15,3 @@ component: NGINX Plus delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: webmaster - diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 0e3cc29fa..6e6e3b400 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -3,9 +3,9 @@ alarm: used_ram_to_ignore on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux freebsd hosts: * calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) @@ -15,13 +15,12 @@ component: Memory alarm: ram_in_use on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux hosts: * -# calc: $used * 100 / ($used + $cached + $free) - calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free) + calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free + $buffers) units: % every: 10s warn: $this > (($status >= $WARNING) ? 
(80) : (90)) @@ -32,12 +31,12 @@ component: Memory alarm: ram_available on: mem.available - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux hosts: * - calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) units: % every: 10s warn: $this < (($status >= $WARNING) ? (15) : (10)) @@ -46,24 +45,25 @@ component: Memory info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin - alarm: oom_kill - on: mem.oom_kill - os: linux - hosts: * - lookup: sum -1m unaligned - units: kills - every: 10s - warn: $this > 0 - delay: down 5m - info: number of out of memory kills in the last minute - to: sysadmin + alarm: oom_kill + on: mem.oom_kill + os: linux + hosts: * + lookup: sum -30m unaligned + units: kills + every: 5m + warn: $this > 0 + delay: down 10m +host labels: _is_k8s_node = false + info: number of out of memory kills in the last 30 minutes + to: sysadmin ## FreeBSD alarm: ram_in_use on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: freebsd hosts: * calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) @@ -77,9 +77,9 @@ component: Memory alarm: ram_available on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: freebsd hosts: * calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index e8b289942..dfb771e8c 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -1,26 +1,10 @@ -# make sure redis is 
running - - template: redis_last_collected_secs - on: redis.operations - class: KV Storage -component: Redis - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - template: redis_bgsave_broken families: * on: redis.bgsave_health - class: KV Storage + class: Errors + type: KV Storage component: Redis - type: Errors every: 10s crit: $rdb_last_bgsave_status != 0 units: ok/failed @@ -31,9 +15,9 @@ component: Redis template: redis_bgsave_slow families: * on: redis.bgsave_now - class: KV Storage + class: Latency + type: KV Storage component: Redis - type: Latency every: 10s warn: $rdb_bgsave_in_progress > 600 crit: $rdb_bgsave_in_progress > 1200 diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf index ca22e60de..14aa76b4c 100644 --- a/health/health.d/retroshare.conf +++ b/health/health.d/retroshare.conf @@ -1,26 +1,11 @@ -# make sure RetroShare is running - - template: retroshare_last_collected_secs - on: retroshare.peers - class: Data Sharing -component: Retroshare - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # make sure the DHT is fine when active template: retroshare_dht_working on: retroshare.dht - class: Data Sharing + class: Utilization + type: Data Sharing component: Retroshare - type: Utilization calc: $dht_size_all units: peers every: 1m diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index b2c0e8d9c..261fd48c6 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -1,24 +1,10 @@ -# Ensure that Riak is running. template: riak_last_collected_secs - template: riakkv_last_collected_secs - on: riak.kv.throughput - class: Database -component: Riak KV - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba # Warn if a list keys operation is running. 
template: riakkv_list_keys_active on: riak.core.fsm_active - class: Database + class: Utilization + type: Database component: Riak KV - type: Utilization calc: $list_fsm_active units: state machines every: 10s @@ -31,9 +17,9 @@ component: Riak KV # KV GET template: riakkv_1h_kv_get_mean_latency on: riak.kv.latency.get - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $node_get_fsm_time_mean lookup: average -1h unaligned of time every: 30s @@ -43,9 +29,9 @@ component: Riak KV template: riakkv_kv_get_slow on: riak.kv.latency.get - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $mean lookup: average -3m unaligned of time units: ms @@ -61,9 +47,9 @@ component: Riak KV # KV PUT template: riakkv_1h_kv_put_mean_latency on: riak.kv.latency.put - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $node_put_fsm_time_mean lookup: average -1h unaligned of time every: 30s @@ -73,9 +59,9 @@ component: Riak KV template: riakkv_kv_put_slow on: riak.kv.latency.put - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $mean lookup: average -3m unaligned of time units: ms @@ -95,9 +81,9 @@ component: Riak KV # On systems observed, this is < 2000, but may grow depending on load. 
template: riakkv_vm_high_process_count on: riak.vm - class: Database + class: Utilization + type: Database component: Riak KV - type: Utilization calc: $sys_process_count units: processes every: 10s diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf index 3c0dc1168..ab110bf07 100644 --- a/health/health.d/scaleio.conf +++ b/health/health.d/scaleio.conf @@ -1,27 +1,11 @@ -# make sure scaleio is running - - template: scaleio_last_collected_secs - on: scaleio.system_capacity_total - class: Storage -component: ScaleIO - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # make sure Storage Pool capacity utilization is under limit template: scaleio_storage_pool_capacity_utilization on: scaleio.storage_pool_capacity_utilization - class: Storage + class: Utilization + type: Storage component: ScaleIO - type: Utilization calc: $used units: % every: 10s @@ -36,9 +20,9 @@ component: ScaleIO template: scaleio_sdc_mdm_connection_state on: scaleio.sdc_mdm_connection_state - class: Storage + class: Utilization + type: Storage component: ScaleIO - type: Utilization calc: $connected every: 10s warn: $this != 1 diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index d8b01caff..345f87505 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -5,9 +5,9 @@ alarm: 1min_netdev_backlog_exceeded on: system.softnet_stat - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * lookup: average -1m unaligned absolute of dropped @@ -21,9 +21,9 @@ component: Network alarm: 1min_netdev_budget_ran_outs on: system.softnet_stat - class: System + class: Errors + type: System 
component: Network - type: Errors os: linux hosts: * lookup: average -1m unaligned absolute of squeezed @@ -38,9 +38,9 @@ component: Network alarm: 10min_netisr_backlog_exceeded on: system.softnet_stat - class: System + class: Errors + type: System component: Network - type: Errors os: freebsd hosts: * lookup: average -1m unaligned absolute of qdrops diff --git a/health/health.d/squid.conf b/health/health.d/squid.conf deleted file mode 100644 index 5c3d17629..000000000 --- a/health/health.d/squid.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure squid is running - - template: squid_last_collected_secs - on: squid.clients_requests - class: Web Proxy -component: Squid - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: proxyadmin - diff --git a/health/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf index f793b5ed1..493c8b73a 100644 --- a/health/health.d/stiebeleltron.conf +++ b/health/health.d/stiebeleltron.conf @@ -1,9 +1,9 @@ template: stiebeleltron_last_collected_secs families: * on: stiebeleltron.heating.hc1 - class: Other + class: Latency + type: Other component: Sensors - type: Latency calc: $now - $last_collected_t every: 10s units: seconds ago diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf index 5b3f89a97..03c319320 100644 --- a/health/health.d/swap.conf +++ b/health/health.d/swap.conf @@ -3,9 +3,9 @@ alarm: 30min_ram_swapped_out on: system.swapio - class: System + class: Workload + type: System component: Memory - type: Workload os: linux freebsd hosts: * lookup: sum -30m unaligned absolute of out @@ -20,12 +20,12 @@ component: Memory alarm: used_swap on: system.swap - class: System + class: Utilization + type: System 
component: Memory - type: Utilization os: linux freebsd hosts: * - calc: $used * 100 / ( $used + $free ) + calc: ($used + $free) > 0 ? ($used * 100 / ($used + $free)) : 0 units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf index cc1a8698d..38213a8db 100644 --- a/health/health.d/systemdunits.conf +++ b/health/health.d/systemdunits.conf @@ -4,9 +4,9 @@ ## Service units template: systemd_service_units_state on: systemd.service_units_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -18,9 +18,9 @@ component: Systemd units ## Socket units template: systemd_socket_units_state on: systemd.socket_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -32,9 +32,9 @@ component: Systemd units ## Target units template: systemd_target_units_state on: systemd.target_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -46,9 +46,9 @@ component: Systemd units ## Path units template: systemd_path_units_state on: systemd.path_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -60,9 +60,9 @@ component: Systemd units ## Device units template: systemd_device_units_state on: systemd.device_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -74,9 +74,9 @@ component: Systemd units ## Mount units template: systemd_mount_units_state on: systemd.mount_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -88,9 
+88,9 @@ component: Systemd units ## Automount units template: systemd_automount_units_state on: systemd.automount_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -102,9 +102,9 @@ component: Systemd units ## Swap units template: systemd_swap_units_state on: systemd.swap_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -116,9 +116,9 @@ component: Systemd units ## Scope units template: systemd_scope_units_state on: systemd.scope_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -130,9 +130,9 @@ component: Systemd units ## Slice units template: systemd_slice_units_state on: systemd.slice_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf index f2c5e4e5d..67b3bee53 100644 --- a/health/health.d/tcp_conn.conf +++ b/health/health.d/tcp_conn.conf @@ -7,9 +7,9 @@ alarm: tcp_connections on: ipv4.tcpsock - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * calc: (${tcp_max_connections} > 0) ? 
( ${connections} * 100 / ${tcp_max_connections} ) : 0 diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf index 51a0e461c..d4bcfa248 100644 --- a/health/health.d/tcp_listen.conf +++ b/health/health.d/tcp_listen.conf @@ -20,9 +20,9 @@ alarm: 1m_tcp_accept_queue_overflows on: ip.tcp_accept_queue - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: average -60s unaligned absolute of ListenOverflows @@ -38,9 +38,9 @@ component: Network # CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 alarm: 1m_tcp_accept_queue_drops on: ip.tcp_accept_queue - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: average -60s unaligned absolute of ListenDrops @@ -63,9 +63,9 @@ component: Network alarm: 1m_tcp_syn_queue_drops on: ip.tcp_syn_queue - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: average -60s unaligned absolute of TCPReqQFullDrop @@ -80,9 +80,9 @@ component: Network alarm: 1m_tcp_syn_queue_cookies on: ip.tcp_syn_queue - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: average -60s unaligned absolute of TCPReqQFullDoCookies diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf index 646e5c6da..318be20ac 100644 --- a/health/health.d/tcp_mem.conf +++ b/health/health.d/tcp_mem.conf @@ -8,9 +8,9 @@ alarm: tcp_memory on: ipv4.sockstat_tcp_mem - class: System + class: Utilization + type: System component: Network - type: Utilization os: linux hosts: * calc: ${mem} * 100 / ${tcp_mem_high} diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf index 6e94d67d1..cbd628da5 100644 --- a/health/health.d/tcp_orphans.conf +++ b/health/health.d/tcp_orphans.conf @@ -9,9 +9,9 @@ alarm: tcp_orphans on: ipv4.sockstat_tcp_sockets - class: System + class: Errors 
+ type: System component: Network - type: Errors os: linux hosts: * calc: ${orphan} * 100 / ${tcp_max_orphans} diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 41355dad6..190271e47 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -6,9 +6,9 @@ alarm: 1m_ipv4_tcp_resets_sent on: ipv4.tcphandshake - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * lookup: average -1m at -10s unaligned absolute of OutRsts @@ -18,9 +18,9 @@ component: Network alarm: 10s_ipv4_tcp_resets_sent on: ipv4.tcphandshake - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * lookup: average -10s unaligned absolute of OutRsts @@ -40,9 +40,9 @@ component: Network alarm: 1m_ipv4_tcp_resets_received on: ipv4.tcphandshake - class: System + class: Errors + type: System component: Network - type: Errors os: linux freebsd hosts: * lookup: average -1m at -10s unaligned absolute of AttemptFails @@ -52,9 +52,9 @@ component: Network alarm: 10s_ipv4_tcp_resets_received on: ipv4.tcphandshake - class: System + class: Errors + type: System component: Network - type: Errors os: linux freebsd hosts: * lookup: average -10s unaligned absolute of AttemptFails diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf new file mode 100644 index 000000000..ea90c4000 --- /dev/null +++ b/health/health.d/timex.conf @@ -0,0 +1,17 @@ + +# It can take several minutes before ntpd selects a server to synchronize with; +# try checking after 17 minutes (1024 seconds). 
+ + alarm: system_clock_sync_state + on: system.clock_sync_state + os: linux + class: Error + type: System +component: Clock + calc: $state + units: synchronization state + every: 10s + warn: $system.uptime.uptime > 17 * 60 AND $this == 0 + delay: down 5m + info: the system time is not synchronized to a reliable server + to: silent diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf index 342a1aedd..64f47dfa7 100644 --- a/health/health.d/udp_errors.conf +++ b/health/health.d/udp_errors.conf @@ -6,9 +6,9 @@ alarm: 1m_ipv4_udp_receive_buffer_errors on: ipv4.udperrors - class: System + class: Errors + type: System component: Network - type: Errors os: linux freebsd hosts: * lookup: average -1m unaligned absolute of RcvbufErrors @@ -24,9 +24,9 @@ component: Network alarm: 1m_ipv4_udp_send_buffer_errors on: ipv4.udperrors - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * lookup: average -1m unaligned absolute of SndbufErrors diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf index 1df15474f..4e8d164d2 100644 --- a/health/health.d/unbound.conf +++ b/health/health.d/unbound.conf @@ -1,27 +1,11 @@ -# make sure unbound is running - - template: unbound_last_collected_secs - on: unbound.queries - class: DNS -component: Unbound - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # make sure there is no overwritten/dropped queries in the request-list template: unbound_request_list_overwritten on: unbound.request_list_jostle_list - class: DNS + class: Errors + type: DNS component: Unbound - type: Errors lookup: average -60s unaligned absolute match-names of overwritten units: queries every: 10s @@ -32,9 +16,9 @@ component: Unbound template: unbound_request_list_dropped on: unbound.request_list_jostle_list - class: DNS + class: Errors + type: DNS component: Unbound - type: Errors lookup: average -60s unaligned absolute match-names of dropped units: queries every: 10s diff --git a/health/health.d/varnish.conf b/health/health.d/varnish.conf deleted file mode 100644 index 7f3bd6c82..000000000 --- a/health/health.d/varnish.conf +++ /dev/null @@ -1,12 +0,0 @@ - alarm: varnish_last_collected - on: varnish.uptime - class: Web Proxy -component: Varnish - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf index 8538e488c..a9cc7ceef 100644 --- a/health/health.d/vcsa.conf +++ b/health/health.d/vcsa.conf @@ -1,20 +1,4 @@ -# make sure vcsa is running and responding - - template: vcsa_last_collected_secs - on: vcsa.system_health - class: Virtual Machine -component: VMware vCenter - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # Overall system health: # - 0: all components are healthy. # - 1: one or more components might become overloaded soon. @@ -24,9 +8,9 @@ component: VMware vCenter template: vcsa_system_health on: vcsa.system_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of system units: status every: 10s @@ -46,9 +30,9 @@ component: VMware vCenter template: vcsa_swap_health on: vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of swap units: status every: 10s @@ -61,9 +45,9 @@ component: VMware vCenter template: vcsa_storage_health on: vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of storage units: status every: 10s @@ -76,9 +60,9 @@ component: VMware vCenter template: vcsa_mem_health on: vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of mem units: status every: 10s @@ -91,9 +75,9 @@ component: VMware vCenter template: vcsa_load_health on: vcsa.components_health - class: Virtual Machine + class: Utilization + type: Virtual Machine component: VMware vCenter - type: Utilization lookup: max -10s unaligned of load units: status every: 10s @@ -106,9 +90,9 @@ component: VMware vCenter template: vcsa_database_storage_health on: vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of database_storage units: status every: 10s @@ -121,9 +105,9 @@ component: VMware vCenter template: vcsa_applmgmt_health on: 
vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of applmgmt units: status every: 10s @@ -143,9 +127,9 @@ component: VMware vCenter template: vcsa_software_updates_health on: vcsa.software_updates_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of software_packages units: status every: 10s diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf index 737147f38..cfbe2a524 100644 --- a/health/health.d/vernemq.conf +++ b/health/health.d/vernemq.conf @@ -1,27 +1,11 @@ -# Availability - - template: vernemq_last_collected_secs - on: vernemq.node_uptime - class: Messaging -component: VerneMQ - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # Socket errors template: vernemq_socket_errors on: vernemq.socket_errors - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: sum -1m unaligned absolute of socket_error units: errors every: 1m @@ -34,9 +18,9 @@ component: VerneMQ template: vernemq_queue_message_drop on: vernemq.queue_undelivered_messages - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute of queue_message_drop units: dropped messages every: 1m @@ -47,9 +31,9 @@ component: VerneMQ template: vernemq_queue_message_expired on: vernemq.queue_undelivered_messages - class: Messaging + class: Latency + type: Messaging component: VerneMQ - type: Latency lookup: average -1m unaligned absolute of queue_message_expired units: expired messages every: 1m @@ -60,9 +44,9 @@ component: VerneMQ template: vernemq_queue_message_unhandled on: vernemq.queue_undelivered_messages - class: Messaging + class: Latency + type: Messaging component: VerneMQ - type: Latency lookup: average -1m unaligned absolute of queue_message_unhandled units: unhandled messages every: 1m @@ -75,9 +59,9 @@ component: VerneMQ template: vernemq_average_scheduler_utilization on: vernemq.average_scheduler_utilization - class: Messaging + class: Utilization + type: Messaging component: VerneMQ - type: Utilization lookup: average -10m unaligned units: % every: 1m @@ -91,9 +75,9 @@ component: VerneMQ template: vernemq_cluster_dropped on: vernemq.cluster_dropped - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: sum -1m unaligned units: KiB every: 1m @@ -104,9 +88,9 @@ component: VerneMQ template: vernemq_netsplits on: vernemq.netsplits - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload 
lookup: sum -1m unaligned absolute of netsplit_detected units: netsplits every: 10s @@ -119,9 +103,9 @@ component: VerneMQ template: vernemq_mqtt_connack_sent_reason_unsuccessful on: vernemq.mqtt_connack_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -134,9 +118,9 @@ component: VerneMQ template: vernemq_mqtt_disconnect_received_reason_not_normal on: vernemq.mqtt_disconnect_received_reason - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute match-names of !normal_disconnect,* units: packets every: 1m @@ -147,9 +131,9 @@ component: VerneMQ template: vernemq_mqtt_disconnect_sent_reason_not_normal on: vernemq.mqtt_disconnect_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !normal_disconnect,* units: packets every: 1m @@ -162,9 +146,9 @@ component: VerneMQ template: vernemq_mqtt_subscribe_error on: vernemq.mqtt_subscribe_error - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute units: failed ops every: 1m @@ -175,9 +159,9 @@ component: VerneMQ template: vernemq_mqtt_subscribe_auth_error on: vernemq.mqtt_subscribe_auth_error - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: attempts every: 1m @@ -190,9 +174,9 @@ component: VerneMQ template: vernemq_mqtt_unsubscribe_error on: vernemq.mqtt_unsubscribe_error - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute units: failed ops every: 1m @@ -205,9 +189,9 @@ component: VerneMQ template: vernemq_mqtt_publish_errors on: vernemq.mqtt_publish_errors - class: Messaging + class: 
Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute units: failed ops every: 1m @@ -218,9 +202,9 @@ component: VerneMQ template: vernemq_mqtt_publish_auth_errors on: vernemq.mqtt_publish_auth_errors - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: attempts every: 1m @@ -233,9 +217,9 @@ component: VerneMQ template: vernemq_mqtt_puback_received_reason_unsuccessful on: vernemq.mqtt_puback_received_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -246,9 +230,9 @@ component: VerneMQ template: vernemq_mqtt_puback_sent_reason_unsuccessful on: vernemq.mqtt_puback_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -259,9 +243,9 @@ component: VerneMQ template: vernemq_mqtt_puback_unexpected on: vernemq.mqtt_puback_invalid_error - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: messages every: 1m @@ -274,9 +258,9 @@ component: VerneMQ template: vernemq_mqtt_pubrec_received_reason_unsuccessful on: vernemq.mqtt_pubrec_received_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -287,9 +271,9 @@ component: VerneMQ template: vernemq_mqtt_pubrec_sent_reason_unsuccessful on: vernemq.mqtt_pubrec_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -300,9 +284,9 @@ component: VerneMQ template: 
vernemq_mqtt_pubrec_invalid_error on: vernemq.mqtt_pubrec_invalid_error - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: messages every: 1m @@ -315,9 +299,9 @@ component: VerneMQ template: vernemq_mqtt_pubrel_received_reason_unsuccessful on: vernemq.mqtt_pubrel_received_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -328,9 +312,9 @@ component: VerneMQ template: vernemq_mqtt_pubrel_sent_reason_unsuccessful on: vernemq.mqtt_pubrel_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -343,9 +327,9 @@ component: VerneMQ template: vernemq_mqtt_pubcomp_received_reason_unsuccessful on: vernemq.mqtt_pubcomp_received_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -356,9 +340,9 @@ component: VerneMQ template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful on: vernemq.mqtt_pubcomp_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -369,9 +353,9 @@ component: VerneMQ template: vernemq_mqtt_pubcomp_unexpected on: vernemq.mqtt_pubcomp_invalid_error - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: messages every: 1m diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf index aee7c5cd4..d8fc899b9 100644 --- a/health/health.d/vsphere.conf +++ b/health/health.d/vsphere.conf @@ -6,9 +6,9 @@ template: vsphere_vm_mem_usage on: 
vsphere.vm_mem_usage_percentage - class: Virtual Machine + class: Utilization + type: Virtual Machine component: Memory - type: Utilization hosts: * calc: $used units: % @@ -23,9 +23,9 @@ component: Memory template: vsphere_host_mem_usage on: vsphere.host_mem_usage_percentage - class: Virtual Machine + class: Utilization + type: Virtual Machine component: Memory - type: Utilization hosts: * calc: $used units: % @@ -39,9 +39,9 @@ component: Memory template: vsphere_inbound_packets_errors on: vsphere.net_errors_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx @@ -51,9 +51,9 @@ component: Network template: vsphere_outbound_packets_errors on: vsphere.net_errors_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx @@ -65,9 +65,9 @@ component: Network template: vsphere_inbound_packets_errors_ratio on: vsphere.net_packets_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx @@ -81,9 +81,9 @@ component: Network template: vsphere_outbound_packets_errors_ratio on: vsphere.net_packets_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx @@ -100,9 +100,9 @@ component: Network template: vsphere_cpu_usage on: vsphere.cpu_usage_total - class: Virtual Machine + class: Utilization + type: Virtual Machine component: CPU - type: Utilization hosts: * lookup: average -10m unaligned match-names of used units: % @@ -117,9 +117,9 @@ component: CPU template: vsphere_inbound_packets_dropped on: vsphere.net_drops_total - class: Virtual Machine + class: Errors + type: Virtual Machine 
component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx @@ -129,9 +129,9 @@ component: Network template: vsphere_outbound_packets_dropped on: vsphere.net_drops_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx @@ -143,9 +143,9 @@ component: Network template: vsphere_inbound_packets_dropped_ratio on: vsphere.net_packets_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx @@ -159,9 +159,9 @@ component: Network template: vsphere_outbound_packets_dropped_ratio on: vsphere.net_packets_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 127c9a9c6..454e0abef 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -1,22 +1,4 @@ -# make sure we can collect web log data - - template: last_collected_secs - on: web_log.response_codes - class: Web Server -component: Web log - type: Latency - families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - # ----------------------------------------------------------------------------- # high level response code alarms @@ -29,9 +11,9 @@ component: Web log template: 1m_requests on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) @@ -41,9 +23,9 @@ component: Web log template: 1m_successful on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned of successful_requests calc: $this * 100 / $1m_requests @@ -57,41 +39,39 @@ component: Web log template: 1m_redirects on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned of redirects calc: $this * 100 / $1m_requests units: % every: 10s warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster template: 1m_bad_requests on: web_log.response_statuses - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of bad_requests calc: $this * 100 / $1m_requests units: % every: 10s warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? 
( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster template: 1m_internal_errors on: web_log.response_statuses - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of server_errors calc: $this * 100 / $1m_requests @@ -114,9 +94,9 @@ component: Web log template: 1m_total_requests on: web_log.response_codes - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) @@ -126,9 +106,9 @@ component: Web log template: 1m_unmatched on: web_log.response_codes - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of unmatched calc: $this * 100 / $1m_total_requests @@ -151,9 +131,9 @@ component: Web log template: 10m_response_time on: web_log.response_time - class: System + class: Latency + type: System component: Web log - type: Latency families: * lookup: average -10m unaligned of avg units: ms @@ -162,9 +142,9 @@ component: Web log template: web_slow on: web_log.response_time - class: Web Server + class: Latency + type: Web Server component: Web log - type: Latency families: * lookup: average -1m unaligned of avg units: ms @@ -191,9 +171,9 @@ component: Web log template: 5m_successful_old on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: average -5m at -5m unaligned of successful_requests units: requests/s @@ -202,9 +182,9 @@ component: Web log template: 5m_successful on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: average -5m unaligned of successful_requests units: requests/s @@ -213,9 +193,9 @@ component: Web log template: 
5m_requests_ratio on: web_log.response_codes - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100) units: % @@ -233,23 +213,6 @@ component: Web log # ---------------------------------------------------GO-VERSION--------------------------------------------------------- -# make sure we can collect web log data - - template: web_log_last_collected_secs - on: web_log.requests - class: Web Server -component: Web log - type: Latency - families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - # unmatched lines # the following alarms trigger only when there are enough data. 
@@ -261,9 +224,9 @@ component: Web log template: web_log_1m_total_requests on: web_log.requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) @@ -273,9 +236,9 @@ component: Web log template: web_log_1m_unmatched on: web_log.excluded_requests - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of unmatched calc: $this * 100 / $web_log_1m_total_requests @@ -298,9 +261,9 @@ component: Web log template: web_log_1m_requests on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) @@ -310,9 +273,9 @@ component: Web log template: web_log_1m_successful on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned of success calc: $this * 100 / $web_log_1m_requests @@ -326,41 +289,39 @@ component: Web log template: web_log_1m_redirects on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned of redirect calc: $this * 100 / $web_log_1m_requests units: % every: 10s warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? 
( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster template: web_log_1m_bad_requests on: web_log.type_requests - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of bad calc: $this * 100 / $web_log_1m_requests units: % every: 10s warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster template: web_log_1m_internal_errors on: web_log.type_requests - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of error calc: $this * 100 / $web_log_1m_requests @@ -384,9 +345,9 @@ component: Web log template: web_log_10m_response_time on: web_log.request_processing_time - class: System + class: Latency + type: System component: Web log - type: Latency families: * lookup: average -10m unaligned of avg units: ms @@ -395,9 +356,9 @@ component: Web log template: web_log_web_slow on: web_log.request_processing_time - class: Web Server + class: Latency + type: Web Server component: Web log - type: Latency families: * lookup: average -1m unaligned of avg units: ms @@ -424,9 +385,9 @@ component: Web log template: web_log_5m_successful_old on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: average -5m at -5m unaligned of success units: requests/s @@ -435,9 +396,9 @@ component: Web log template: web_log_5m_successful on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: average -5m 
unaligned of success units: requests/s @@ -446,9 +407,9 @@ component: Web log template: web_log_5m_requests_ratio on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100) units: % diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf index c6d3a9de0..be5eb58f9 100644 --- a/health/health.d/whoisquery.conf +++ b/health/health.d/whoisquery.conf @@ -1,26 +1,9 @@ -# make sure whoisquery is running - - template: whoisquery_last_collected_secs - on: whoisquery.time_until_expiration - class: Other -component: WHOIS - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 60s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - template: whoisquery_days_until_expiration on: whoisquery.time_until_expiration - class: Other + class: Utilization + type: Other component: WHOIS - type: Utilization calc: $expiry units: seconds every: 60s diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf index 6bd4e077f..90d39ce9d 100644 --- a/health/health.d/wmi.conf +++ b/health/health.d/wmi.conf @@ -1,29 +1,11 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -## Availability - - template: wmi_last_collected_secs - on: cpu.collector_duration - class: Windows -component: Availability - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - ## CPU template: wmi_10min_cpu_usage on: wmi.cpu_utilization_total - class: Windows + class: Utilization + type: Windows component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt @@ -40,9 +22,9 @@ component: CPU template: wmi_ram_in_use on: wmi.memory_utilization - class: Windows + class: Utilization + type: Windows component: Memory - type: Utilization os: linux hosts: * calc: ($used) * 100 / ($used + $available) @@ -56,9 +38,9 @@ component: Memory template: wmi_swap_in_use on: wmi.memory_swap_utilization - class: Windows + class: Utilization + type: Windows component: Memory - type: Utilization os: linux hosts: * calc: ($used) * 100 / ($used + $available) @@ -75,9 +57,9 @@ component: Memory template: wmi_inbound_packets_discarded on: wmi.net_discarded - class: Windows + class: Errors + type: Windows component: Network - type: Errors os: linux hosts: * families: * @@ -91,9 +73,9 @@ component: Network template: wmi_outbound_packets_discarded on: wmi.net_discarded - class: Windows + class: Errors + type: Windows component: Network - type: Errors os: linux hosts: * families: * @@ -107,9 +89,9 @@ component: Network template: wmi_inbound_packets_errors on: wmi.net_errors - class: Windows + class: Errors + type: Windows component: Network - type: Errors os: linux hosts: * families: * @@ -123,9 +105,9 @@ component: Network template: wmi_outbound_packets_errors on: wmi.net_errors - class: Windows + class: Errors + type: Windows component: Network - type: Errors os: linux hosts: * families: * @@ -142,9 +124,9 @@ component: Network template: wmi_disk_in_use on: wmi.logical_disk_utilization - class: Windows + class: Utilization + type: Windows component: Disk - type: Utilization os: linux hosts: * calc: ($used) * 100 / ($used + $free) diff 
--git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index 93c406b7a..fc69d0288 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -1,26 +1,9 @@ -# make sure x509check is running - - template: x509check_last_collected_secs - on: x509check.time_until_expiration - class: Certificates -component: x509 certificates - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 60s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - template: x509check_days_until_expiration on: x509check.time_until_expiration - class: Certificates + class: Latency + type: Certificates component: x509 certificates - type: Latency calc: $expiry units: seconds every: 60s @@ -31,9 +14,9 @@ component: x509 certificates template: x509check_revocation_status on: x509check.revocation_status - class: Certificates + class: Errors + type: Certificates component: x509 certificates - type: Errors calc: $revoked every: 60s crit: $this != nan AND $this != 0 diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf index d6f5fa2fe..785838d47 100644 --- a/health/health.d/zfs.conf +++ b/health/health.d/zfs.conf @@ -1,9 +1,9 @@ alarm: zfs_memory_throttle on: zfs.memory_ops - class: System + class: Utilization + type: System component: File system - type: Utilization lookup: sum -10m unaligned absolute of throttled units: events every: 1m @@ -16,9 +16,9 @@ component: File system template: zfs_pool_state_warn on: zfspool.state - class: System + class: Errors + type: System component: File system - type: Errors calc: $degraded units: boolean every: 10s @@ -29,9 +29,9 @@ component: File system template: zfs_pool_state_crit on: zfspool.state - class: System + class: Errors + type: System component: File 
system - type: Errors calc: $faulted + $unavail units: boolean every: 10s diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf deleted file mode 100644 index 8c7d5a73d..000000000 --- a/health/health.d/zookeeper.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure zookeeper is running - - template: zookeeper_last_collected_secs - on: zookeeper.requests - class: KV Storage -component: ZooKeeper - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - |