diff options
Diffstat (limited to 'health/health.d/vernemq.conf')
-rw-r--r-- | health/health.d/vernemq.conf | 597 |
1 files changed, 339 insertions, 258 deletions
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf index 9598dd39c..737147f38 100644 --- a/health/health.d/vernemq.conf +++ b/health/health.d/vernemq.conf @@ -1,300 +1,381 @@ # Availability -template: vernemq_last_collected_secs - on: vernemq.node_uptime - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin + template: vernemq_last_collected_secs + on: vernemq.node_uptime + class: Messaging +component: VerneMQ + type: Latency + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin # Socket errors -template: vernemq_socket_errors - on: vernemq.socket_errors - lookup: sum -1m unaligned absolute of socket_error - units: errors - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - info: number of socket errors in the last minute - to: sysadmin + template: vernemq_socket_errors + on: vernemq.socket_errors + class: Messaging +component: VerneMQ + type: Errors + lookup: sum -1m unaligned absolute of socket_error + units: errors + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of socket errors in the last minute + to: sysadmin # Queues dropped/expired/unhandled PUBLISH messages -template: vernemq_queue_message_drop - on: vernemq.queue_undelivered_messages - lookup: sum -1m unaligned absolute of queue_message_drop - units: dropped messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of dropped messaged due to full queues in the last minute - to: sysadmin - -template: vernemq_queue_message_expired - on: vernemq.queue_undelivered_messages - lookup: sum -1m unaligned absolute of queue_message_expired - units: expired messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (15)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of messages which expired before delivery in the last minute - to: sysadmin - -template: vernemq_queue_message_unhandled - on: vernemq.queue_undelivered_messages - lookup: sum -1m unaligned absolute of queue_message_unhandled - units: unhandled messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of unhandled messages (connections with clean session=true) in the last minute - to: sysadmin + template: vernemq_queue_message_drop + on: vernemq.queue_undelivered_messages + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute of queue_message_drop + units: dropped messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of dropped messaged due to full queues in the last minute + to: sysadmin + + template: vernemq_queue_message_expired + on: vernemq.queue_undelivered_messages + class: Messaging +component: VerneMQ + type: Latency + lookup: average -1m unaligned absolute of queue_message_expired + units: expired messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of messages which expired before delivery in the last minute + to: sysadmin + + template: vernemq_queue_message_unhandled + on: vernemq.queue_undelivered_messages + class: Messaging +component: VerneMQ + type: Latency + lookup: average -1m unaligned absolute of queue_message_unhandled + units: unhandled messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of unhandled messages (connections with clean session=true) in the last minute + to: sysadmin # Erlang VM -template: vernemq_average_scheduler_utilization - on: vernemq.average_scheduler_utilization - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - info: average scheduler utilization over the last 10 minutes - to: sysadmin + template: vernemq_average_scheduler_utilization + on: vernemq.average_scheduler_utilization + class: Messaging +component: VerneMQ + type: Utilization + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: average scheduler utilization over the last 10 minutes + to: sysadmin # Cluster communication and netsplits -template: vernemq_cluster_dropped - on: vernemq.cluster_dropped - lookup: sum -1m unaligned - units: KiB - every: 1m - warn: $this > 0 - delay: up 5m down 5m multiplier 1.5 max 1h - info: amount of traffic dropped during communication with the cluster nodes in the last minute - to: sysadmin - -template: vernemq_netsplits - on: vernemq.netsplits - lookup: sum -1m unaligned absolute of netsplit_detected - units: netsplits - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: number of detected netsplits (split brain situation) in the last minute - to: sysadmin + template: vernemq_cluster_dropped + on: vernemq.cluster_dropped + class: Messaging +component: VerneMQ + type: Errors + lookup: sum -1m unaligned + units: KiB + every: 1m + warn: $this > 0 + delay: up 5m down 5m multiplier 1.5 max 1h + info: amount of traffic dropped during communication with the cluster nodes in the last minute + to: sysadmin + + template: vernemq_netsplits + on: vernemq.netsplits + class: Messaging +component: VerneMQ + type: Workload + lookup: sum -1m unaligned absolute of netsplit_detected + units: netsplits + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: number of detected netsplits (split brain situation) in the last minute + to: sysadmin # Unsuccessful CONNACK -template: vernemq_mqtt_connack_sent_reason_unsuccessful - on: vernemq.mqtt_connack_sent_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute - to: sysadmin + template: vernemq_mqtt_connack_sent_reason_unsuccessful + on: vernemq.mqtt_connack_sent_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute + to: sysadmin # Not normal DISCONNECT -template: vernemq_mqtt_disconnect_received_reason_not_normal - on: vernemq.mqtt_disconnect_received_reason - lookup: sum -1m unaligned absolute match-names of !normal_disconnect,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of received not normal v5 DISCONNECT packets in the last minute - to: sysadmin - -template: vernemq_mqtt_disconnect_sent_reason_not_normal - on: vernemq.mqtt_disconnect_sent_reason - lookup: sum -1m unaligned absolute match-names of !normal_disconnect,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of sent not normal v5 DISCONNECT packets in the last minute - to: sysadmin + template: vernemq_mqtt_disconnect_received_reason_not_normal + on: vernemq.mqtt_disconnect_received_reason + class: Messaging +component: VerneMQ + type: Workload + lookup: average -1m unaligned absolute match-names of !normal_disconnect,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of received not normal v5 DISCONNECT packets in the last minute + to: sysadmin + + template: vernemq_mqtt_disconnect_sent_reason_not_normal + on: vernemq.mqtt_disconnect_sent_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !normal_disconnect,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of sent not normal v5 DISCONNECT packets in the last minute + to: sysadmin # SUBSCRIBE errors and unauthorized attempts -template: vernemq_mqtt_subscribe_error - on: vernemq.mqtt_subscribe_error - lookup: sum -1m unaligned absolute - units: failed ops - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of failed v3/v5 SUBSCRIBE operations in the last minute - to: sysadmin - -template: vernemq_mqtt_subscribe_auth_error - on: vernemq.mqtt_subscribe_auth_error - lookup: sum -1m unaligned absolute - units: attempts - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute - to: sysadmin + template: vernemq_mqtt_subscribe_error + on: vernemq.mqtt_subscribe_error + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute + units: failed ops + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 SUBSCRIBE operations in the last minute + to: sysadmin + + template: vernemq_mqtt_subscribe_auth_error + on: vernemq.mqtt_subscribe_auth_error + class: Messaging +component: VerneMQ + type: Workload + lookup: average -1m unaligned absolute + units: attempts + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute + to: sysadmin # UNSUBSCRIBE errors -template: vernemq_mqtt_unsubscribe_error - on: vernemq.mqtt_unsubscribe_error - lookup: sum -1m unaligned absolute - units: failed ops - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute - to: sysadmin + template: vernemq_mqtt_unsubscribe_error + on: vernemq.mqtt_unsubscribe_error + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute + units: failed ops + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute + to: sysadmin # PUBLISH errors and unauthorized attempts -template: vernemq_mqtt_publish_errors - on: vernemq.mqtt_publish_errors - lookup: sum -1m unaligned absolute - units: failed ops - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of failed v3/v5 PUBLISH operations in the last minute - to: sysadmin - -template: vernemq_mqtt_publish_auth_errors - on: vernemq.mqtt_publish_auth_errors - lookup: sum -1m unaligned absolute - units: attempts - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of unauthorized v3/v5 PUBLISH attempts in the last minute - to: sysadmin + template: vernemq_mqtt_publish_errors + on: vernemq.mqtt_publish_errors + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute + units: failed ops + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 PUBLISH operations in the last minute + to: sysadmin + + template: vernemq_mqtt_publish_auth_errors + on: vernemq.mqtt_publish_auth_errors + class: Messaging +component: VerneMQ + type: Workload + lookup: average -1m unaligned absolute + units: attempts + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of unauthorized v3/v5 PUBLISH attempts in the last minute + to: sysadmin # Unsuccessful and unexpected PUBACK -template: vernemq_mqtt_puback_received_reason_unsuccessful - on: vernemq.mqtt_puback_received_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of received unsuccessful v5 PUBACK packets in the last minute - to: sysadmin - -template: vernemq_mqtt_puback_sent_reason_unsuccessful - on: vernemq.mqtt_puback_sent_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of sent unsuccessful v5 PUBACK packets in the last minute - to: sysadmin - -template: vernemq_mqtt_puback_unexpected - on: vernemq.mqtt_puback_invalid_error - lookup: sum -1m unaligned absolute - units: messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of received unexpected v3/v5 PUBACK packets in the last minute - to: sysadmin + template: vernemq_mqtt_puback_received_reason_unsuccessful + on: vernemq.mqtt_puback_received_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBACK packets in the last minute + to: sysadmin + + template: vernemq_mqtt_puback_sent_reason_unsuccessful + on: vernemq.mqtt_puback_sent_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBACK packets in the last minute + to: sysadmin + + template: vernemq_mqtt_puback_unexpected + on: vernemq.mqtt_puback_invalid_error + class: Messaging +component: VerneMQ + type: Workload + lookup: average -1m unaligned absolute + units: messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3/v5 PUBACK packets in the last minute + to: sysadmin # Unsuccessful and unexpected PUBREC -template: vernemq_mqtt_pubrec_received_reason_unsuccessful - on: vernemq.mqtt_pubrec_received_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of received unsuccessful v5 PUBREC packets in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrec_sent_reason_unsuccessful - on: vernemq.mqtt_pubrec_sent_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of sent unsuccessful v5 PUBREC packets in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrec_invalid_error - on: vernemq.mqtt_pubrec_invalid_error - lookup: sum -1m unaligned absolute - units: messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of received unexpected v3 PUBREC packets in the last minute - to: sysadmin + template: vernemq_mqtt_pubrec_received_reason_unsuccessful + on: vernemq.mqtt_pubrec_received_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBREC packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubrec_sent_reason_unsuccessful + on: vernemq.mqtt_pubrec_sent_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBREC packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubrec_invalid_error + on: vernemq.mqtt_pubrec_invalid_error + class: Messaging +component: VerneMQ + type: Workload + lookup: average -1m unaligned absolute + units: messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3 PUBREC packets in the last minute + to: sysadmin # Unsuccessful PUBREL -template: vernemq_mqtt_pubrel_received_reason_unsuccessful - on: vernemq.mqtt_pubrel_received_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of received unsuccessful v5 PUBREL packets in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrel_sent_reason_unsuccessful - on: vernemq.mqtt_pubrel_sent_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of sent unsuccessful v5 PUBREL packets in the last minute - to: sysadmin + template: vernemq_mqtt_pubrel_received_reason_unsuccessful + on: vernemq.mqtt_pubrel_received_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBREL packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubrel_sent_reason_unsuccessful + on: vernemq.mqtt_pubrel_sent_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBREL packets in the last minute + to: sysadmin # Unsuccessful and unexpected PUBCOMP -template: vernemq_mqtt_pubcomp_received_reason_unsuccessful - on: vernemq.mqtt_pubcomp_received_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of received unsuccessful v5 PUBCOMP packets in the last minute - to: sysadmin - -template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful - on: vernemq.mqtt_pubcomp_sent_reason - lookup: sum -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of sent unsuccessful v5 PUBCOMP packets in the last minute - to: sysadmin - -template: vernemq_mqtt_pubcomp_unexpected - on: vernemq.mqtt_pubcomp_invalid_error - lookup: sum -1m unaligned absolute - units: messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 5m down 5m multiplier 1.5 max 2h - info: number of received unexpected v3/v5 PUBCOMP packets in the last minute - to: sysadmin + template: vernemq_mqtt_pubcomp_received_reason_unsuccessful + on: vernemq.mqtt_pubcomp_received_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBCOMP packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful + on: vernemq.mqtt_pubcomp_sent_reason + class: Messaging +component: VerneMQ + type: Errors + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBCOMP packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubcomp_unexpected + on: vernemq.mqtt_pubcomp_invalid_error + class: Messaging +component: VerneMQ + type: Workload + lookup: average -1m unaligned absolute + units: messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3/v5 PUBCOMP packets in the last minute + to: sysadmin |