summaryrefslogtreecommitdiffstats
path: root/src/health/health.d
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-11-25 14:45:37 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-11-25 14:48:03 +0000
commite55403ed71282d7bfd8b56df219de3c28a8af064 (patch)
tree524889e5becb81643bf8741e3082955dca076f09 /src/health/health.d
parentReleasing debian version 1.47.5-1. (diff)
downloadnetdata-e55403ed71282d7bfd8b56df219de3c28a8af064.tar.xz
netdata-e55403ed71282d7bfd8b56df219de3c28a8af064.zip
Merging upstream version 2.0.3+dfsg:
- does not include dygraphs anymore (Closes: #923993)
- does not include pako anymore (Closes: #1042533)
- does not include dashboard binaries anymore (Closes: #1045145)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/health/health.d')
-rw-r--r--src/health/health.d/anomalies.conf25
-rw-r--r--src/health/health.d/apcupsd.conf48
-rw-r--r--src/health/health.d/boinc.conf6
-rw-r--r--src/health/health.d/ceph.conf18
-rw-r--r--src/health/health.d/disks.conf28
-rw-r--r--src/health/health.d/net.conf24
-rw-r--r--src/health/health.d/vernemq.conf188
7 files changed, 159 insertions, 178 deletions
diff --git a/src/health/health.d/anomalies.conf b/src/health/health.d/anomalies.conf
deleted file mode 100644
index 80d63bb8d..000000000
--- a/src/health/health.d/anomalies.conf
+++ /dev/null
@@ -1,25 +0,0 @@
-## raise a warning alarm if an anomaly probability is consistently above 50%
-
-## "foreach" was removed, these alarms don't work anymore
-
-# template: anomalies_anomaly_probabilities
-# on: anomalies.probability
-# class: Errors
-# type: Netdata
-#component: ML
-# lookup: average -2m foreach *
-# every: 1m
-# warn: $this > 50
-# info: average anomaly probability over the last 2 minutes
-
-# raise a warning alarm if an anomaly flag is consistently firing
-
-# template: anomalies_anomaly_flags
-# on: anomalies.anomaly
-# class: Errors
-# type: Netdata
-#component: ML
-# lookup: sum -2m foreach *
-# every: 1m
-# warn: $this > 10
-# info: number of anomalies in the last 2 minutes
diff --git a/src/health/health.d/apcupsd.conf b/src/health/health.d/apcupsd.conf
index 5fd7aa112..58d3b214b 100644
--- a/src/health/health.d/apcupsd.conf
+++ b/src/health/health.d/apcupsd.conf
@@ -1,11 +1,11 @@
# you can disable an alarm notification by setting the 'to' line to: silent
- template: apcupsd_10min_ups_load
- on: apcupsd.load
+ template: apcupsd_ups_load_capacity
+ on: apcupsd.ups_load_capacity_utilization
class: Utilization
type: Power Supply
-component: UPS
- lookup: average -10m unaligned of percentage
+component: UPS device
+ lookup: average -10m unaligned of load
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (70) : (80))
@@ -14,13 +14,11 @@ component: UPS
info: APC UPS average load over the last 10 minutes
to: sitemgr
-# Discussion in https://github.com/netdata/netdata/pull/3928:
-# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
- template: apcupsd_ups_charge
- on: apcupsd.charge
+ template: apcupsd_ups_battery_charge
+ on: apcupsd.ups_battery_charge
class: Errors
type: Power Supply
-component: UPS
+component: UPS device
lookup: average -60s unaligned of charge
units: %
every: 60s
@@ -32,7 +30,7 @@ component: UPS
to: sitemgr
template: apcupsd_last_collected_secs
- on: apcupsd.load
+ on: apcupsd.ups_status
class: Latency
type: Power Supply
component: UPS device
@@ -47,21 +45,21 @@ component: UPS device
#Send out a warning when SELFTEST code is BT or NG. Code descriptions can be found at:
#http://www.apcupsd.org/manual/#:~:text=or%20N/A.-,SELFTEST,-The%20results%20of
- template: apcupsd_selftest_warning
- on: apcupsd.selftest
+ template: apcupsd_ups_selftest_warning
+ on: apcupsd.ups_selftest
lookup: max -1s unaligned match-names of BT,NG
units: status
every: 10s
warn: $this == 1
delay: up 0 down 15m multiplier 1.5 max 1h
- info: APC UPS self-test failed due to insufficient battery capacity or due to overload.
+ info: APC UPS self-test failed due to insufficient battery capacity or due to overload
to: sitemgr
#Send out a warning when STATUS code is ONBATT,OVERLOAD,LOWBATT,REPLACEBATT,NOBATT,COMMLOST
#https://man.archlinux.org/man/apcaccess.8.en#:~:text=apcupsd%20was%20started-,STATUS,-%3A%20UPS%20status.%20One
- template: apcupsd_status_onbatt
- on: apcupsd.status
+ template: apcupsd_ups_status_onbatt
+ on: apcupsd.ups_status
lookup: max -1s unaligned match-names of ONBATT
units: status
every: 10s
@@ -70,8 +68,8 @@ component: UPS device
info: APC UPS has switched to battery power because the input power has failed
to: sitemgr
- template: apcupsd_status_overload
- on: apcupsd.status
+ template: apcupsd_ups_status_overload
+ on: apcupsd.ups_status
lookup: max -1s unaligned match-names of OVERLOAD
units: status
every: 10s
@@ -80,8 +78,8 @@ component: UPS device
info: APC UPS is overloaded and cannot supply enough power to the load
to: sitemgr
- template: apcupsd_status_lowbatt
- on: apcupsd.status
+ template: apcupsd_ups_status_lowbatt
+ on: apcupsd.ups_status
lookup: max -1s unaligned match-names of LOWBATT
units: status
every: 10s
@@ -90,8 +88,8 @@ component: UPS device
info: APC UPS battery is low and needs to be recharged
to: sitemgr
- template: apcupsd_status_replacebatt
- on: apcupsd.status
+ template: apcupsd_ups_status_replacebatt
+ on: apcupsd.ups_status
lookup: max -1s unaligned match-names of REPLACEBATT
units: status
every: 10s
@@ -100,8 +98,8 @@ component: UPS device
info: APC UPS battery has reached the end of its lifespan and needs to be replaced
to: sitemgr
- template: apcupsd_status_nobatt
- on: apcupsd.status
+ template: apcupsd_ups_status_nobatt
+ on: apcupsd.ups_status
lookup: max -1s unaligned match-names of NOBATT
units: status
every: 10s
@@ -110,8 +108,8 @@ component: UPS device
info: APC UPS has no battery
to: sitemgr
- template: apcupsd_status_commlost
- on: apcupsd.status
+ template: apcupsd_ups_status_commlost
+ on: apcupsd.ups_status
lookup: max -1s unaligned match-names of COMMLOST
units: status
every: 10s
diff --git a/src/health/health.d/boinc.conf b/src/health/health.d/boinc.conf
index 6fd987de1..987d20212 100644
--- a/src/health/health.d/boinc.conf
+++ b/src/health/health.d/boinc.conf
@@ -2,11 +2,11 @@
# Warn on any compute errors encountered.
template: boinc_compute_errors
- on: boinc.states
+ on: boinc.tasks_per_state
class: Errors
type: Computing
component: BOINC
- lookup: average -10m unaligned of comperror
+ lookup: average -10m unaligned of compute_error
units: tasks
every: 1m
warn: $this > 0
@@ -17,7 +17,7 @@ component: BOINC
# Warn on lots of upload errors
template: boinc_upload_errors
- on: boinc.states
+ on: boinc.tasks_per_state
class: Errors
type: Computing
component: BOINC
diff --git a/src/health/health.d/ceph.conf b/src/health/health.d/ceph.conf
index 44d351338..0048e2a7c 100644
--- a/src/health/health.d/ceph.conf
+++ b/src/health/health.d/ceph.conf
@@ -1,16 +1,16 @@
# low ceph disk available
- template: ceph_cluster_space_usage
- on: ceph.general_usage
+ template: ceph_cluster_physical_capacity_utilization
+ on: ceph.cluster_physical_capacity_utilization
class: Utilization
type: Storage
component: Ceph
- calc: $used * 100 / ($used + $avail)
+ calc: $utilization
units: %
every: 1m
- warn: $this > (($status >= $WARNING ) ? (85) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 5m multiplier 1.2 max 1h
- summary: Ceph cluster disk space utilization
- info: Ceph cluster disk space utilization
- to: sysadmin
+ warn: $this > (($status >= $WARNING ) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 5m multiplier 1.2 max 1h
+ summary: Ceph cluster ${label:fsid} disk space utilization
+ info: Ceph cluster ${label:fsid} disk space utilization
+ to: sysadmin
diff --git a/src/health/health.d/disks.conf b/src/health/health.d/disks.conf
index fe96837fb..d8176a6be 100644
--- a/src/health/health.d/disks.conf
+++ b/src/health/health.d/disks.conf
@@ -12,24 +12,22 @@
class: Utilization
type: System
component: Disk
- host labels: _os=linux freebsd
-chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
- calc: $used * 100 / ($avail + $used)
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING ) ? (80) : (90))
- crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5
- delay: up 1m down 15m multiplier 1.5 max 1h
- summary: Disk ${label:mount_point} space usage
- info: Total space utilization of disk ${label:mount_point}
- to: sysadmin
+chart labels: mount_point=!/dev !/dev/* !/run !/run/* !HarddiskVolume* *
+ calc: $used * 100 / ($avail + $used)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (80) : (90))
+ crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: Disk ${label:mount_point} space usage
+ info: Total space utilization of disk ${label:mount_point}
+ to: sysadmin
template: disk_inode_usage
on: disk.inodes
class: Utilization
type: System
component: Disk
- host labels: _os=linux freebsd
chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
calc: $used * 100 / ($avail + $used)
units: %
@@ -55,7 +53,6 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
template: disk_fill_rate
on: disk.space
-host labels: _os=linux freebsd
lookup: min -10m at -50m unaligned of avail
calc: ($this - $avail) / (($now - $after) / 3600)
every: 1m
@@ -67,7 +64,6 @@ host labels: _os=linux freebsd
template: out_of_disk_space_time
on: disk.space
-host labels: _os=linux freebsd
calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
units: hours
every: 10s
@@ -92,7 +88,6 @@ host labels: _os=linux freebsd
template: disk_inode_rate
on: disk.inodes
-host labels: _os=linux freebsd
lookup: min -10m at -50m unaligned of avail
calc: ($this - $avail) / (($now - $after) / 3600)
every: 1m
@@ -105,7 +100,6 @@ host labels: _os=linux freebsd
template: out_of_disk_inodes_time
on: disk.inodes
-host labels: _os=linux freebsd
calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
units: hours
every: 10s
@@ -129,7 +123,6 @@ host labels: _os=linux freebsd
class: Utilization
type: System
component: Disk
-host labels: _os=linux freebsd
lookup: average -10m unaligned
units: %
every: 1m
@@ -150,7 +143,6 @@ host labels: _os=linux freebsd
class: Latency
type: System
component: Disk
-host labels: _os=linux freebsd
lookup: average -10m unaligned
units: ms
every: 1m
diff --git a/src/health/health.d/net.conf b/src/health/health.d/net.conf
index 448a3733d..609741aca 100644
--- a/src/health/health.d/net.conf
+++ b/src/health/health.d/net.conf
@@ -19,7 +19,7 @@ component: Network
class: Workload
type: System
component: Network
-host labels: _os=linux
+host labels: _os=linux windows
lookup: average -1m unaligned absolute of received
calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
units: %
@@ -35,7 +35,7 @@ host labels: _os=linux
class: Workload
type: System
component: Network
-host labels: _os=linux
+host labels: _os=linux windows
lookup: average -1m unaligned absolute of sent
calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
units: %
@@ -214,7 +214,6 @@ host labels: _os=linux
class: Workload
type: System
component: Network
-host labels: _os=linux freebsd
lookup: average -1m unaligned of received
units: packets
every: 10s
@@ -225,7 +224,6 @@ host labels: _os=linux freebsd
class: Workload
type: System
component: Network
-host labels: _os=linux freebsd
lookup: average -10s unaligned of received
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
every: 10s
@@ -237,3 +235,21 @@ host labels: _os=linux freebsd
info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
to: silent
+
+# -----------------------------------------------------------------------------
+# output queue length
+
+ template: network_interface_output_queue_length
+ on: net.queue_length
+ class: Errors
+ type: System
+ component: Network
+host labels: _os=windows
+ units: packets
+ every: 10s
+ warn: $length > 2
+ delay: up 1m down 1m multiplier 1.5 max 1h
+ summary: System network interface ${label:device} output queue length
+ info: The Output Queue Length on interface ${label:device} should be zero, otherwise there are delays and bottlenecks.
+ to: silent
+
diff --git a/src/health/health.d/vernemq.conf b/src/health/health.d/vernemq.conf
index 6ea9f99dc..df7f68fc4 100644
--- a/src/health/health.d/vernemq.conf
+++ b/src/health/health.d/vernemq.conf
@@ -2,67 +2,67 @@
# Socket errors
template: vernemq_socket_errors
- on: vernemq.socket_errors
+ on: vernemq.node_socket_errors
class: Errors
type: Messaging
component: VerneMQ
- lookup: sum -1m unaligned absolute of socket_error
+ lookup: sum -1m unaligned
units: errors
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ socket errors
- info: Number of socket errors in the last minute
+ summary: Node ${label:node} socket errors
+ info: Node ${label:node} socket errors in the last minute
to: sysadmin
# Queues dropped/expired/unhandled PUBLISH messages
template: vernemq_queue_message_drop
- on: vernemq.queue_undelivered_messages
+ on: vernemq.node_queue_undelivered_messages
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute of queue_message_drop
+ lookup: average -1m unaligned absolute of dropped
units: dropped messages
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ dropped messages
- info: Number of dropped messages due to full queues in the last minute
+ summary: Node ${label:node} dropped messages
+ info: Node ${label:node} dropped messages due to full queues in the last minute
to: sysadmin
template: vernemq_queue_message_expired
- on: vernemq.queue_undelivered_messages
+ on: vernemq.node_queue_undelivered_messages
class: Latency
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute of queue_message_expired
+ lookup: average -1m unaligned absolute of expired
units: expired messages
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ expired messages
- info: number of messages which expired before delivery in the last minute
+ summary: Node ${label:node} expired messages
+ info: Node ${label:node} messages expired before delivery in the last minute
to: sysadmin
template: vernemq_queue_message_unhandled
- on: vernemq.queue_undelivered_messages
+ on: vernemq.node_queue_undelivered_messages
class: Latency
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute of queue_message_unhandled
+ lookup: average -1m unaligned absolute of unhandled
units: unhandled messages
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unhandled messages
- info: Number of unhandled messages (connections with clean session=true) in the last minute
+ summary: Node ${label:node} unhandled messages
+ info: Node ${label:node} unhandled messages in the last minute
to: sysadmin
# Erlang VM
template: vernemq_average_scheduler_utilization
- on: vernemq.average_scheduler_utilization
+ on: vernemq.node_average_scheduler_utilization
class: Utilization
type: Messaging
component: VerneMQ
@@ -72,14 +72,14 @@ component: VerneMQ
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- summary: VerneMQ scheduler utilization
- info: Average scheduler utilization over the last 10 minutes
+ summary: Node ${label:node} scheduler utilization
+ info: Node ${label:node} scheduler utilization over the last 10 minutes
to: sysadmin
# Cluster communication and netsplits
template: vernemq_cluster_dropped
- on: vernemq.cluster_dropped
+ on: vernemq.node_cluster_dropped
class: Errors
type: Messaging
component: VerneMQ
@@ -88,74 +88,74 @@ component: VerneMQ
every: 1m
warn: $this > 0
delay: up 5m down 5m multiplier 1.5 max 1h
- summary: VerneMQ dropped traffic
- info: Amount of traffic dropped during communication with the cluster nodes in the last minute
+ summary: Node ${label:node} dropped cluster traffic
+ info: Node ${label:node} traffic dropped during communication with the cluster nodes in the last minute
to: sysadmin
template: vernemq_netsplits
- on: vernemq.netsplits
+ on: vernemq.node_netsplits
class: Workload
type: Messaging
component: VerneMQ
- lookup: sum -1m unaligned absolute of netsplit_detected
+ lookup: sum -1m unaligned absolute of detected
units: netsplits
every: 10s
warn: $this > 0
delay: down 5m multiplier 1.5 max 2h
- summary: VerneMQ netsplits
- info: Number of detected netsplits (split brain situation) in the last minute
+ summary: Node ${label:node} detected netsplits
+ info: Node ${label:node} detected netsplits (split brain) in the last minute
to: sysadmin
# Unsuccessful CONNACK
template: vernemq_mqtt_connack_sent_reason_unsuccessful
- on: vernemq.mqtt_connack_sent_reason
+ on: vernemq.node_mqtt_connack_sent_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful CONNACK
- info: Number of sent unsuccessful v3/v5 CONNACK packets in the last minute
+ summary: Node ${label:node} unsuccessful sent CONNACK
+ info: Node ${label:node} unsuccessful sent v5 CONNACK packets in the last minute
to: sysadmin
# Not normal DISCONNECT
template: vernemq_mqtt_disconnect_received_reason_not_normal
- on: vernemq.mqtt_disconnect_received_reason
+ on: vernemq.node_mqtt_disconnect_received_by_reason_code
class: Workload
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
+ lookup: average -1m unaligned absolute of !normal_disconnect,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ received not normal DISCONNECT
- info: Number of received not normal v5 DISCONNECT packets in the last minute
+ summary: Node ${label:node} received not normal DISCONNECT
+ info: Node ${label:node} received not normal v5 DISCONNECT packets in the last minute
to: sysadmin
template: vernemq_mqtt_disconnect_sent_reason_not_normal
- on: vernemq.mqtt_disconnect_sent_reason
+ on: vernemq.node_mqtt_disconnect_sent_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
+ lookup: average -1m unaligned absolute of !normal_disconnect,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ sent not normal DISCONNECT
- info: Number of sent not normal v5 DISCONNECT packets in the last minute
+ summary: Node ${label:node} sent not normal DISCONNECT
+ info: Node ${label:node} sent not normal v5 DISCONNECT packets in the last minute
to: sysadmin
# SUBSCRIBE errors and unauthorized attempts
template: vernemq_mqtt_subscribe_error
- on: vernemq.mqtt_subscribe_error
+ on: vernemq.node_mqtt_subscribe_error
class: Errors
type: Messaging
component: VerneMQ
@@ -164,12 +164,12 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ failed SUBSCRIBE
- info: Number of failed v3/v5 SUBSCRIBE operations in the last minute
+ summary: Node ${label:node} mqtt v${label:mqtt_version} failed SUBSCRIBE
+ info: Node ${label:node} mqtt v${label:mqtt_version} failed SUBSCRIBE operations in the last minute
to: sysadmin
template: vernemq_mqtt_subscribe_auth_error
- on: vernemq.mqtt_subscribe_auth_error
+ on: vernemq.node_mqtt_subscribe_auth_error
class: Workload
type: Messaging
component: VerneMQ
@@ -178,14 +178,14 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unauthorized SUBSCRIBE
- info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
+ summary: Node ${label:node} mqtt v${label:mqtt_version} unauthorized SUBSCRIBE
+ info: Node ${label:node} mqtt v${label:mqtt_version} unauthorized SUBSCRIBE attempts in the last minute
to: sysadmin
# UNSUBSCRIBE errors
template: vernemq_mqtt_unsubscribe_error
- on: vernemq.mqtt_unsubscribe_error
+ on: vernemq.node_mqtt_unsubscribe_error
class: Errors
type: Messaging
component: VerneMQ
@@ -194,14 +194,14 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ failed UNSUBSCRIBE
- info: Number of failed v3/v5 UNSUBSCRIBE operations in the last minute
+ summary: Node ${label:node} mqtt v${label:mqtt_version} failed UNSUBSCRIBE
+ info: Node ${label:node} mqtt v${label:mqtt_version} failed UNSUBSCRIBE operations in the last minute
to: sysadmin
# PUBLISH errors and unauthorized attempts
template: vernemq_mqtt_publish_errors
- on: vernemq.mqtt_publish_errors
+ on: vernemq.node_mqtt_publish_errors
class: Errors
type: Messaging
component: VerneMQ
@@ -210,12 +210,12 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ failed PUBLISH
- info: Number of failed v3/v5 PUBLISH operations in the last minute
+ summary: Node ${label:node} mqtt v${label:mqtt_version} failed PUBLISH
+ info: Node ${label:node} mqtt v${label:mqtt_version} failed PUBLISH operations in the last minute
to: sysadmin
template: vernemq_mqtt_publish_auth_errors
- on: vernemq.mqtt_publish_auth_errors
+ on: vernemq.node_mqtt_publish_auth_errors
class: Workload
type: Messaging
component: VerneMQ
@@ -224,42 +224,42 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unauthorized PUBLISH
- info: Number of unauthorized v3/v5 PUBLISH attempts in the last minute
+ summary: Node ${label:node} mqtt v${label:mqtt_version} unauthorized PUBLISH
+ info: Node ${label:node} mqtt v${label:mqtt_version} unauthorized PUBLISH attempts in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBACK
template: vernemq_mqtt_puback_received_reason_unsuccessful
- on: vernemq.mqtt_puback_received_reason
+ on: vernemq.node_mqtt_puback_received_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful received PUBACK
- info: Number of received unsuccessful v5 PUBACK packets in the last minute
+ summary: Node ${label:node} mqtt v5 received unsuccessful PUBACK
+ info: Node ${label:node} mqtt v5 received unsuccessful PUBACK packets in the last minute
to: sysadmin
template: vernemq_mqtt_puback_sent_reason_unsuccessful
- on: vernemq.mqtt_puback_sent_reason
+ on: vernemq.node_mqtt_puback_sent_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful sent PUBACK
- info: Number of sent unsuccessful v5 PUBACK packets in the last minute
+ summary: Node ${label:node} mqtt v5 unsuccessful sent PUBACK
+ info: Node ${label:node} mqtt v5 unsuccessful sent PUBACK packets in the last minute
to: sysadmin
template: vernemq_mqtt_puback_unexpected
- on: vernemq.mqtt_puback_invalid_error
+ on: vernemq.node_mqtt_puback_invalid_error
class: Workload
type: Messaging
component: VerneMQ
@@ -268,42 +268,42 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unnexpected recieved PUBACK
- info: Number of received unexpected v3/v5 PUBACK packets in the last minute
+ summary: Node ${label:node} mqtt v${label:mqtt_version} received unexpected PUBACK
+ info: Node ${label:node} mqtt v${label:mqtt_version} received unexpected PUBACK messages in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBREC
template: vernemq_mqtt_pubrec_received_reason_unsuccessful
- on: vernemq.mqtt_pubrec_received_reason
+ on: vernemq.node_mqtt_pubrec_received_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful received PUBREC
- info: Number of received unsuccessful v5 PUBREC packets in the last minute
+ summary: Node ${label:node} mqtt v5 received unsuccessful PUBREC
+ info: Node ${label:node} mqtt v5 received unsuccessful PUBREC packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
- on: vernemq.mqtt_pubrec_sent_reason
+ on: vernemq.node_mqtt_pubrec_sent_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful sent PUBREC
- info: Number of sent unsuccessful v5 PUBREC packets in the last minute
+ summary: Node ${label:node} mqtt v5 unsuccessful sent PUBREC
+ info: Node ${label:node} mqtt v5 unsuccessful sent PUBREC packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubrec_invalid_error
- on: vernemq.mqtt_pubrec_invalid_error
+ on: vernemq.node_mqtt_pubrec_invalid_error
class: Workload
type: Messaging
component: VerneMQ
@@ -312,72 +312,72 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ invalid received PUBREC
- info: Number of received invalid v3 PUBREC packets in the last minute
+ summary: Node ${label:node} mqtt v${label:mqtt_version} received invalid PUBREC
+ info: Node ${label:node} mqtt v${label:mqtt_version} received invalid PUBREC packets in the last minute
to: sysadmin
# Unsuccessful PUBREL
template: vernemq_mqtt_pubrel_received_reason_unsuccessful
- on: vernemq.mqtt_pubrel_received_reason
+ on: vernemq.node_mqtt_pubrel_received_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful received PUBREL
- info: Number of received unsuccessful v5 PUBREL packets in the last minute
+ summary: Node ${label:node} mqtt v5 received unsuccessful PUBREL
+ info: Node ${label:node} mqtt v5 received unsuccessful PUBREL packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
- on: vernemq.mqtt_pubrel_sent_reason
+ on: vernemq.node_mqtt_pubrel_sent_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful sent PUBREL
- info: number of sent unsuccessful v5 PUBREL packets in the last minute
+ summary: Node ${label:node} mqtt v5 unsuccessful sent PUBREL
+ info: Node ${label:node} mqtt v5 unsuccessful sent PUBREL packets in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBCOMP
template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
- on: vernemq.mqtt_pubcomp_received_reason
+ on: vernemq.node_mqtt_pubcomp_received_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful received PUBCOMP
- info: Number of received unsuccessful v5 PUBCOMP packets in the last minute
+ summary: Node ${label:node} mqtt v5 received unsuccessful PUBCOMP
+ info: Node ${label:node} mqtt v5 received unsuccessful PUBCOMP packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
- on: vernemq.mqtt_pubcomp_sent_reason
+ on: vernemq.node_mqtt_pubcomp_sent_by_reason_code
class: Errors
type: Messaging
component: VerneMQ
- lookup: average -1m unaligned absolute match-names of !success,*
+ lookup: average -1m unaligned absolute of !success,*
units: packets
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unsuccessful sent PUBCOMP
- info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
+ summary: Node ${label:node} mqtt v5 unsuccessful sent PUBCOMP
+ info: Node ${label:node} mqtt v5 unsuccessful sent PUBCOMP packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubcomp_unexpected
- on: vernemq.mqtt_pubcomp_invalid_error
+ on: vernemq.node_mqtt_pubcomp_invalid_error
class: Workload
type: Messaging
component: VerneMQ
@@ -386,6 +386,6 @@ component: VerneMQ
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: up 2m down 5m multiplier 1.5 max 2h
- summary: VerneMQ unexpected received PUBCOMP
- info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
+ summary: Node ${label:node} mqtt v${label:mqtt_version} received unexpected PUBCOMP
+ info: Node ${label:node} mqtt v${label:mqtt_version} received unexpected PUBCOMP packets in the last minute
to: sysadmin