diff options
Diffstat (limited to 'src/health/health.d')
-rw-r--r-- | src/health/health.d/anomalies.conf | 25 | ||||
-rw-r--r-- | src/health/health.d/apcupsd.conf | 48 | ||||
-rw-r--r-- | src/health/health.d/boinc.conf | 6 | ||||
-rw-r--r-- | src/health/health.d/ceph.conf | 18 | ||||
-rw-r--r-- | src/health/health.d/disks.conf | 28 | ||||
-rw-r--r-- | src/health/health.d/net.conf | 24 | ||||
-rw-r--r-- | src/health/health.d/vernemq.conf | 188 |
7 files changed, 159 insertions, 178 deletions
diff --git a/src/health/health.d/anomalies.conf b/src/health/health.d/anomalies.conf deleted file mode 100644 index 80d63bb8d..000000000 --- a/src/health/health.d/anomalies.conf +++ /dev/null @@ -1,25 +0,0 @@ -## raise a warning alarm if an anomaly probability is consistently above 50% - -## "foreach" was removed, these alarms don't work anymore - -# template: anomalies_anomaly_probabilities -# on: anomalies.probability -# class: Errors -# type: Netdata -#component: ML -# lookup: average -2m foreach * -# every: 1m -# warn: $this > 50 -# info: average anomaly probability over the last 2 minutes - -# raise a warning alarm if an anomaly flag is consistently firing - -# template: anomalies_anomaly_flags -# on: anomalies.anomaly -# class: Errors -# type: Netdata -#component: ML -# lookup: sum -2m foreach * -# every: 1m -# warn: $this > 10 -# info: number of anomalies in the last 2 minutes diff --git a/src/health/health.d/apcupsd.conf b/src/health/health.d/apcupsd.conf index 5fd7aa112..58d3b214b 100644 --- a/src/health/health.d/apcupsd.conf +++ b/src/health/health.d/apcupsd.conf @@ -1,11 +1,11 @@ # you can disable an alarm notification by setting the 'to' line to: silent - template: apcupsd_10min_ups_load - on: apcupsd.load + template: apcupsd_ups_load_capacity + on: apcupsd.ups_load_capacity_utilization class: Utilization type: Power Supply -component: UPS - lookup: average -10m unaligned of percentage +component: UPS device + lookup: average -10m unaligned of load units: % every: 1m warn: $this > (($status >= $WARNING) ? (70) : (80)) @@ -14,13 +14,11 @@ component: UPS info: APC UPS average load over the last 10 minutes to: sitemgr -# Discussion in https://github.com/netdata/netdata/pull/3928: -# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. 
- template: apcupsd_ups_charge - on: apcupsd.charge + template: apcupsd_ups_battery_charge + on: apcupsd.ups_battery_charge class: Errors type: Power Supply -component: UPS +component: UPS device lookup: average -60s unaligned of charge units: % every: 60s @@ -32,7 +30,7 @@ component: UPS to: sitemgr template: apcupsd_last_collected_secs - on: apcupsd.load + on: apcupsd.ups_status class: Latency type: Power Supply component: UPS device @@ -47,21 +45,21 @@ component: UPS device #Send out a warning when SELFTEST code is BT or NG. Code descriptions can be found at: #http://www.apcupsd.org/manual/#:~:text=or%20N/A.-,SELFTEST,-The%20results%20of - template: apcupsd_selftest_warning - on: apcupsd.selftest + template: apcupsd_ups_selftest_warning + on: apcupsd.ups_selftest lookup: max -1s unaligned match-names of BT,NG units: status every: 10s warn: $this == 1 delay: up 0 down 15m multiplier 1.5 max 1h - info: APC UPS self-test failed due to insufficient battery capacity or due to overload. + info: APC UPS self-test failed due to insufficient battery capacity or due to overload to: sitemgr #Send out a warning when STATUS code is ONBATT,OVERLOAD,LOWBATT,REPLACEBATT,NOBATT,COMMLOST #https://man.archlinux.org/man/apcaccess.8.en#:~:text=apcupsd%20was%20started-,STATUS,-%3A%20UPS%20status.%20One - template: apcupsd_status_onbatt - on: apcupsd.status + template: apcupsd_ups_status_onbatt + on: apcupsd.ups_status lookup: max -1s unaligned match-names of ONBATT units: status every: 10s @@ -70,8 +68,8 @@ component: UPS device info: APC UPS has switched to battery power because the input power has failed to: sitemgr - template: apcupsd_status_overload - on: apcupsd.status + template: apcupsd_ups_status_overload + on: apcupsd.ups_status lookup: max -1s unaligned match-names of OVERLOAD units: status every: 10s @@ -80,8 +78,8 @@ component: UPS device info: APC UPS is overloaded and cannot supply enough power to the load to: sitemgr - template: apcupsd_status_lowbatt - on: 
apcupsd.status + template: apcupsd_ups_status_lowbatt + on: apcupsd.ups_status lookup: max -1s unaligned match-names of LOWBATT units: status every: 10s @@ -90,8 +88,8 @@ component: UPS device info: APC UPS battery is low and needs to be recharged to: sitemgr - template: apcupsd_status_replacebatt - on: apcupsd.status + template: apcupsd_ups_status_replacebatt + on: apcupsd.ups_status lookup: max -1s unaligned match-names of REPLACEBATT units: status every: 10s @@ -100,8 +98,8 @@ component: UPS device info: APC UPS battery has reached the end of its lifespan and needs to be replaced to: sitemgr - template: apcupsd_status_nobatt - on: apcupsd.status + template: apcupsd_ups_status_nobatt + on: apcupsd.ups_status lookup: max -1s unaligned match-names of NOBATT units: status every: 10s @@ -110,8 +108,8 @@ component: UPS device info: APC UPS has no battery to: sitemgr - template: apcupsd_status_commlost - on: apcupsd.status + template: apcupsd_ups_status_commlost + on: apcupsd.ups_status lookup: max -1s unaligned match-names of COMMLOST units: status every: 10s diff --git a/src/health/health.d/boinc.conf b/src/health/health.d/boinc.conf index 6fd987de1..987d20212 100644 --- a/src/health/health.d/boinc.conf +++ b/src/health/health.d/boinc.conf @@ -2,11 +2,11 @@ # Warn on any compute errors encountered. 
template: boinc_compute_errors - on: boinc.states + on: boinc.tasks_per_state class: Errors type: Computing component: BOINC - lookup: average -10m unaligned of comperror + lookup: average -10m unaligned of compute_error units: tasks every: 1m warn: $this > 0 @@ -17,7 +17,7 @@ component: BOINC # Warn on lots of upload errors template: boinc_upload_errors - on: boinc.states + on: boinc.tasks_per_state class: Errors type: Computing component: BOINC diff --git a/src/health/health.d/ceph.conf b/src/health/health.d/ceph.conf index 44d351338..0048e2a7c 100644 --- a/src/health/health.d/ceph.conf +++ b/src/health/health.d/ceph.conf @@ -1,16 +1,16 @@ # low ceph disk available - template: ceph_cluster_space_usage - on: ceph.general_usage + template: ceph_cluster_physical_capacity_utilization + on: ceph.cluster_physical_capacity_utilization class: Utilization type: Storage component: Ceph - calc: $used * 100 / ($used + $avail) + calc: $utilization units: % every: 1m - warn: $this > (($status >= $WARNING ) ? (85) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 5m multiplier 1.2 max 1h - summary: Ceph cluster disk space utilization - info: Ceph cluster disk space utilization - to: sysadmin + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 5m multiplier 1.2 max 1h + summary: Ceph cluster ${label:fsid} disk space utilization + info: Ceph cluster ${label:fsid} disk space utilization + to: sysadmin diff --git a/src/health/health.d/disks.conf b/src/health/health.d/disks.conf index fe96837fb..d8176a6be 100644 --- a/src/health/health.d/disks.conf +++ b/src/health/health.d/disks.conf @@ -12,24 +12,22 @@ class: Utilization type: System component: Disk - host labels: _os=linux freebsd -chart labels: mount_point=!/dev !/dev/* !/run !/run/* * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING ) ? 
(80) : (90)) - crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: Disk ${label:mount_point} space usage - info: Total space utilization of disk ${label:mount_point} - to: sysadmin +chart labels: mount_point=!/dev !/dev/* !/run !/run/* !HarddiskVolume* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (80) : (90)) + crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} space usage + info: Total space utilization of disk ${label:mount_point} + to: sysadmin template: disk_inode_usage on: disk.inodes class: Utilization type: System component: Disk - host labels: _os=linux freebsd chart labels: mount_point=!/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % @@ -55,7 +53,6 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* * template: disk_fill_rate on: disk.space -host labels: _os=linux freebsd lookup: min -10m at -50m unaligned of avail calc: ($this - $avail) / (($now - $after) / 3600) every: 1m @@ -67,7 +64,6 @@ host labels: _os=linux freebsd template: out_of_disk_space_time on: disk.space -host labels: _os=linux freebsd calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) units: hours every: 10s @@ -92,7 +88,6 @@ host labels: _os=linux freebsd template: disk_inode_rate on: disk.inodes -host labels: _os=linux freebsd lookup: min -10m at -50m unaligned of avail calc: ($this - $avail) / (($now - $after) / 3600) every: 1m @@ -105,7 +100,6 @@ host labels: _os=linux freebsd template: out_of_disk_inodes_time on: disk.inodes -host labels: _os=linux freebsd calc: ($disk_inode_rate > 0) ? 
($avail / $disk_inode_rate) : (inf) units: hours every: 10s @@ -129,7 +123,6 @@ host labels: _os=linux freebsd class: Utilization type: System component: Disk -host labels: _os=linux freebsd lookup: average -10m unaligned units: % every: 1m @@ -150,7 +143,6 @@ host labels: _os=linux freebsd class: Latency type: System component: Disk -host labels: _os=linux freebsd lookup: average -10m unaligned units: ms every: 1m diff --git a/src/health/health.d/net.conf b/src/health/health.d/net.conf index 448a3733d..609741aca 100644 --- a/src/health/health.d/net.conf +++ b/src/health/health.d/net.conf @@ -19,7 +19,7 @@ component: Network class: Workload type: System component: Network -host labels: _os=linux +host labels: _os=linux windows lookup: average -1m unaligned absolute of received calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) units: % @@ -35,7 +35,7 @@ host labels: _os=linux class: Workload type: System component: Network -host labels: _os=linux +host labels: _os=linux windows lookup: average -1m unaligned absolute of sent calc: ($interface_speed > 0) ? 
($this * 100 / ($interface_speed * 1000)) : ( nan ) units: % @@ -214,7 +214,6 @@ host labels: _os=linux class: Workload type: System component: Network -host labels: _os=linux freebsd lookup: average -1m unaligned of received units: packets every: 10s @@ -225,7 +224,6 @@ host labels: _os=linux freebsd class: Workload type: System component: Network -host labels: _os=linux freebsd lookup: average -10s unaligned of received calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) every: 10s @@ -237,3 +235,21 @@ host labels: _os=linux freebsd info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute to: silent + +# ----------------------------------------------------------------------------- +# output queue length + + template: network_interface_output_queue_length + on: net.queue_length + class: Errors + type: System + component: Network +host labels: _os=windows + units: packets + every: 10s + warn: $length > 2 + delay: up 1m down 1m multiplier 1.5 max 1h + summary: System network interface ${label:device} output queue length + info: The Output Queue Length on interface ${label:device} should be zero, otherwise there are delays and bottlenecks. + to: silent + diff --git a/src/health/health.d/vernemq.conf b/src/health/health.d/vernemq.conf index 6ea9f99dc..df7f68fc4 100644 --- a/src/health/health.d/vernemq.conf +++ b/src/health/health.d/vernemq.conf @@ -2,67 +2,67 @@ # Socket errors template: vernemq_socket_errors - on: vernemq.socket_errors + on: vernemq.node_socket_errors class: Errors type: Messaging component: VerneMQ - lookup: sum -1m unaligned absolute of socket_error + lookup: sum -1m unaligned units: errors every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ socket errors - info: Number of socket errors in the last minute + summary: Node ${label:node} socket errors + info: Node ${label:node} socket errors in the last minute to: sysadmin # Queues dropped/expired/unhandled PUBLISH messages template: vernemq_queue_message_drop - on: vernemq.queue_undelivered_messages + on: vernemq.node_queue_undelivered_messages class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute of queue_message_drop + lookup: average -1m unaligned absolute of dropped units: dropped messages every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ dropped messages - info: Number of dropped messages due to full queues in the last minute + summary: Node ${label:node} dropped messages + info: Node ${label:node} dropped messages due to full queues in the last minute to: sysadmin template: vernemq_queue_message_expired - on: vernemq.queue_undelivered_messages + on: vernemq.node_queue_undelivered_messages class: Latency type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute of queue_message_expired + lookup: average -1m unaligned absolute of expired units: expired messages every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ expired messages - info: number of messages which expired before delivery in the last minute + summary: Node ${label:node} expired messages + info: Node ${label:node} messages expired before delivery in the last minute to: sysadmin template: vernemq_queue_message_unhandled - on: vernemq.queue_undelivered_messages + on: vernemq.node_queue_undelivered_messages class: Latency type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute of queue_message_unhandled + lookup: average -1m unaligned absolute of unhandled units: unhandled messages every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unhandled messages - info: Number of unhandled messages (connections with clean session=true) in the last minute + summary: Node ${label:node} unhandled messages + info: Node ${label:node} unhandled messages in the last minute to: sysadmin # Erlang VM template: vernemq_average_scheduler_utilization - on: vernemq.average_scheduler_utilization + on: vernemq.node_average_scheduler_utilization class: Utilization type: Messaging component: VerneMQ @@ -72,14 +72,14 @@ component: VerneMQ warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ?
(85) : (95)) delay: down 15m multiplier 1.5 max 1h - summary: VerneMQ scheduler utilization - info: Average scheduler utilization over the last 10 minutes + summary: Node ${label:node} scheduler utilization + info: Node ${label:node} scheduler utilization over the last 10 minutes to: sysadmin # Cluster communication and netsplits template: vernemq_cluster_dropped - on: vernemq.cluster_dropped + on: vernemq.node_cluster_dropped class: Errors type: Messaging component: VerneMQ @@ -88,74 +88,74 @@ component: VerneMQ every: 1m warn: $this > 0 delay: up 5m down 5m multiplier 1.5 max 1h - summary: VerneMQ dropped traffic - info: Amount of traffic dropped during communication with the cluster nodes in the last minute + summary: Node ${label:node} dropped cluster traffic + info: Node ${label:node} traffic dropped during communication with the cluster nodes in the last minute to: sysadmin template: vernemq_netsplits - on: vernemq.netsplits + on: vernemq.node_netsplits class: Workload type: Messaging component: VerneMQ - lookup: sum -1m unaligned absolute of netsplit_detected + lookup: sum -1m unaligned absolute of detected units: netsplits every: 10s warn: $this > 0 delay: down 5m multiplier 1.5 max 2h - summary: VerneMQ netsplits - info: Number of detected netsplits (split brain situation) in the last minute + summary: Node ${label:node} detected netsplits + info: Node ${label:node} detected netsplits (split brain) in the last minute to: sysadmin # Unsuccessful CONNACK template: vernemq_mqtt_connack_sent_reason_unsuccessful - on: vernemq.mqtt_connack_sent_reason + on: vernemq.node_mqtt_connack_sent_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful CONNACK - info: Number of sent unsuccessful v3/v5 CONNACK packets in the last minute + summary: Node ${label:node} unsuccessful sent CONNACK + info: Node ${label:node} unsuccessful sent v5 CONNACK packets in the last minute to: sysadmin # Not normal DISCONNECT template: vernemq_mqtt_disconnect_received_reason_not_normal - on: vernemq.mqtt_disconnect_received_reason + on: vernemq.node_mqtt_disconnect_received_by_reason_code class: Workload type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !normal_disconnect,* + lookup: average -1m unaligned absolute of !normal_disconnect,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ received not normal DISCONNECT - info: Number of received not normal v5 DISCONNECT packets in the last minute + summary: Node ${label:node} received not normal DISCONNECT + info: Node ${label:node} received not normal v5 DISCONNECT packets in the last minute to: sysadmin template: vernemq_mqtt_disconnect_sent_reason_not_normal - on: vernemq.mqtt_disconnect_sent_reason + on: vernemq.node_mqtt_disconnect_sent_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !normal_disconnect,* + lookup: average -1m unaligned absolute of !normal_disconnect,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ sent not normal DISCONNECT - info: Number of sent not normal v5 DISCONNECT packets in the last minute + summary: Node ${label:node} sent not normal DISCONNECT + info: Node ${label:node} sent not normal v5 DISCONNECT packets in the last minute to: sysadmin # SUBSCRIBE errors and unauthorized attempts template: vernemq_mqtt_subscribe_error - on: vernemq.mqtt_subscribe_error + on: vernemq.node_mqtt_subscribe_error class: Errors type: Messaging component: VerneMQ @@ -164,12 +164,12 @@ component: VerneMQ every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ failed SUBSCRIBE - info: Number of failed v3/v5 SUBSCRIBE operations in the last minute + summary: Node ${label:node} mqtt v${label:mqtt_version} failed SUBSCRIBE + info: Node ${label:node} mqtt v${label:mqtt_version} failed SUBSCRIBE operations in the last minute to: sysadmin template: vernemq_mqtt_subscribe_auth_error - on: vernemq.mqtt_subscribe_auth_error + on: vernemq.node_mqtt_subscribe_auth_error class: Workload type: Messaging component: VerneMQ @@ -178,14 +178,14 @@ component: VerneMQ every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unauthorized SUBSCRIBE - info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute + summary: Node ${label:node} mqtt v${label:mqtt_version} unauthorized SUBSCRIBE + info: Node ${label:node} mqtt v${label:mqtt_version} unauthorized SUBSCRIBE attempts in the last minute to: sysadmin # UNSUBSCRIBE errors template: vernemq_mqtt_unsubscribe_error - on: vernemq.mqtt_unsubscribe_error + on: vernemq.node_mqtt_unsubscribe_error class: Errors type: Messaging component: VerneMQ @@ -194,14 +194,14 @@ component: VerneMQ every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ failed UNSUBSCRIBE - info: Number of failed v3/v5 UNSUBSCRIBE operations in the last minute + summary: Node ${label:node} mqtt v${label:mqtt_version} failed UNSUBSCRIBE + info: Node ${label:node} mqtt v${label:mqtt_version} failed UNSUBSCRIBE operations in the last minute to: sysadmin # PUBLISH errors and unauthorized attempts template: vernemq_mqtt_publish_errors - on: vernemq.mqtt_publish_errors + on: vernemq.node_mqtt_publish_errors class: Errors type: Messaging component: VerneMQ @@ -210,12 +210,12 @@ component: VerneMQ every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ failed PUBLISH - info: Number of failed v3/v5 PUBLISH operations in the last minute + summary: Node ${label:node} mqtt v${label:mqtt_version} failed PUBLISH + info: Node ${label:node} mqtt v${label:mqtt_version} failed PUBLISH operations in the last minute to: sysadmin template: vernemq_mqtt_publish_auth_errors - on: vernemq.mqtt_publish_auth_errors + on: vernemq.node_mqtt_publish_auth_errors class: Workload type: Messaging component: VerneMQ @@ -224,42 +224,42 @@ component: VerneMQ every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unauthorized PUBLISH - info: Number of unauthorized v3/v5 PUBLISH attempts in the last minute + summary: Node ${label:node} mqtt v${label:mqtt_version} unauthorized PUBLISH + info: Node ${label:node} mqtt v${label:mqtt_version} unauthorized PUBLISH attempts in the last minute to: sysadmin # Unsuccessful and unexpected PUBACK template: vernemq_mqtt_puback_received_reason_unsuccessful - on: vernemq.mqtt_puback_received_reason + on: vernemq.node_mqtt_puback_received_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful received PUBACK - info: Number of received unsuccessful v5 PUBACK packets in the last minute + summary: Node ${label:node} mqtt v5 received unsuccessful PUBACK + info: Node ${label:node} mqtt v5 received unsuccessful PUBACK packets in the last minute to: sysadmin template: vernemq_mqtt_puback_sent_reason_unsuccessful - on: vernemq.mqtt_puback_sent_reason + on: vernemq.node_mqtt_puback_sent_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful sent PUBACK - info: Number of sent unsuccessful v5 PUBACK packets in the last minute + summary: Node ${label:node} mqtt v5 unsuccessful sent PUBACK + info: Node ${label:node} mqtt v5 unsuccessful sent PUBACK packets in the last minute to: sysadmin template: vernemq_mqtt_puback_unexpected - on: vernemq.mqtt_puback_invalid_error + on: vernemq.node_mqtt_puback_invalid_error class: Workload type: Messaging component: VerneMQ @@ -268,42 +268,42 @@ component: VerneMQ every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unnexpected recieved PUBACK - info: Number of received unexpected v3/v5 PUBACK packets in the last minute + summary: Node ${label:node} mqtt v${label:mqtt_version} received unexpected PUBACK + info: Node ${label:node} mqtt v${label:mqtt_version} received unexpected PUBACK messages in the last minute to: sysadmin # Unsuccessful and unexpected PUBREC template: vernemq_mqtt_pubrec_received_reason_unsuccessful - on: vernemq.mqtt_pubrec_received_reason + on: vernemq.node_mqtt_pubrec_received_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ?
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful received PUBREC - info: Number of received unsuccessful v5 PUBREC packets in the last minute + summary: Node ${label:node} mqtt v5 received unsuccessful PUBREC + info: Node ${label:node} mqtt v5 received unsuccessful PUBREC packets in the last minute to: sysadmin template: vernemq_mqtt_pubrec_sent_reason_unsuccessful - on: vernemq.mqtt_pubrec_sent_reason + on: vernemq.node_mqtt_pubrec_sent_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful sent PUBREC - info: Number of sent unsuccessful v5 PUBREC packets in the last minute + summary: Node ${label:node} mqtt v5 unsuccessful sent PUBREC + info: Node ${label:node} mqtt v5 unsuccessful sent PUBREC packets in the last minute to: sysadmin template: vernemq_mqtt_pubrec_invalid_error - on: vernemq.mqtt_pubrec_invalid_error + on: vernemq.node_mqtt_pubrec_invalid_error class: Workload type: Messaging component: VerneMQ @@ -312,72 +312,72 @@ component: VerneMQ every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ invalid received PUBREC - info: Number of received invalid v3 PUBREC packets in the last minute + summary: Node ${label:node} mqtt v${label:mqtt_version} received invalid PUBREC + info: Node ${label:node} mqtt v${label:mqtt_version} received invalid PUBREC packets in the last minute to: sysadmin # Unsuccessful PUBREL template: vernemq_mqtt_pubrel_received_reason_unsuccessful - on: vernemq.mqtt_pubrel_received_reason + on: vernemq.node_mqtt_pubrel_received_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful received PUBREL - info: Number of received unsuccessful v5 PUBREL packets in the last minute + summary: Node ${label:node} mqtt v5 received unsuccessful PUBREL + info: Node ${label:node} mqtt v5 received unsuccessful PUBREL packets in the last minute to: sysadmin template: vernemq_mqtt_pubrel_sent_reason_unsuccessful - on: vernemq.mqtt_pubrel_sent_reason + on: vernemq.node_mqtt_pubrel_sent_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful sent PUBREL - info: number of sent unsuccessful v5 PUBREL packets in the last minute + summary: Node ${label:node} mqtt v5 unsuccessful sent PUBREL + info: Node ${label:node} mqtt v5 unsuccessful sent PUBREL packets in the last minute to: sysadmin # Unsuccessful and unexpected PUBCOMP template: vernemq_mqtt_pubcomp_received_reason_unsuccessful - on: vernemq.mqtt_pubcomp_received_reason + on: vernemq.node_mqtt_pubcomp_received_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful received PUBCOMP - info: Number of received unsuccessful v5 PUBCOMP packets in the last minute + summary: Node ${label:node} mqtt v5 received unsuccessful PUBCOMP + info: Node ${label:node} mqtt v5 received unsuccessful PUBCOMP packets in the last minute to: sysadmin template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful - on: vernemq.mqtt_pubcomp_sent_reason + on: vernemq.node_mqtt_pubcomp_sent_by_reason_code class: Errors type: Messaging component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* + lookup: average -1m unaligned absolute of !success,* units: packets every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful sent PUBCOMP - info: number of sent unsuccessful v5 PUBCOMP packets in the last minute + summary: Node ${label:node} mqtt v5 unsuccessful sent PUBCOMP + info: Node ${label:node} mqtt v5 unsuccessful sent PUBCOMP packets in the last minute to: sysadmin template: vernemq_mqtt_pubcomp_unexpected - on: vernemq.mqtt_pubcomp_invalid_error + on: vernemq.node_mqtt_pubcomp_invalid_error class: Workload type: Messaging component: VerneMQ @@ -386,6 +386,6 @@ component: VerneMQ every: 1m warn: $this > (($status >= $WARNING) ? (0) : (5)) delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unexpected received PUBCOMP - info: number of received unexpected v3/v5 PUBCOMP packets in the last minute + summary: Node ${label:node} mqtt v${label:mqtt_version} received unexpected PUBCOMP + info: Node ${label:node} mqtt v${label:mqtt_version} received unexpected PUBCOMP packets in the last minute to: sysadmin |