From 87649cf32bd0e14d5a903fb85b01e9f41a253540 Mon Sep 17 00:00:00 2001
From: Federico Ceratto
Date: Wed, 23 Nov 2016 15:49:10 +0000
Subject: New upstream version 1.4.0+dfsg

---
 plugins.d/alarm-notify.sh | 767 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 767 insertions(+)
 create mode 100755 plugins.d/alarm-notify.sh

diff --git a/plugins.d/alarm-notify.sh b/plugins.d/alarm-notify.sh
new file mode 100755
index 000000000..feec6ceae
--- /dev/null
+++ b/plugins.d/alarm-notify.sh
@@ -0,0 +1,767 @@
#!/usr/bin/env bash

# netdata
# real-time performance and health monitoring, done right!
# (C) 2016 Costa Tsaousis
# GPL v3+
#
# Script to send alarm notifications for netdata
#
# Features:
#  - multiple notification methods
#  - multiple roles per alarm
#  - multiple recipients per role
#  - severity filtering per recipient
#
# Supported notification methods:
#  - emails
#  - pushover.net notifications
#  - slack.com notifications
#  - telegram.org notifications
#

me="${0}"

# check for BASH v4+ (required for associative arrays)
[ $(( ${BASH_VERSINFO[0]} )) -lt 4 ] && \
    echo >&2 "${me}: BASH version 4 or later is required (this is ${BASH_VERSION})." && \
    exit 1

# defaults to allow running this script by hand
NETDATA_CONFIG_DIR="${NETDATA_CONFIG_DIR-/etc/netdata}"
NETDATA_CACHE_DIR="${NETDATA_CACHE_DIR-/var/cache/netdata}"
[ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io"
[ -z "${NETDATA_HOSTNAME}" ] && NETDATA_HOSTNAME="$(hostname)"
[ -z "${NETDATA_REGISTRY_HOSTNAME}" ] && NETDATA_REGISTRY_HOSTNAME="${NETDATA_HOSTNAME}"

# -----------------------------------------------------------------------------
# parse command line parameters

roles="${1}"               # the roles that should be notified for this event
host="${2}"                # the host that generated this event
unique_id="${3}"           # the unique id of this event
alarm_id="${4}"            # the unique id of the alarm that generated this event
event_id="${5}"            # the incremental id of the event, for this alarm id
when="${6}"                # the timestamp this event occurred
name="${7}"                # the name of the alarm, as given in netdata health.d entries
chart="${8}"               # the name of the chart (type.id)
family="${9}"              # the family of the chart
status="${10}"             # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
old_status="${11}"         # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
value="${12}"              # the current value of the alarm
old_value="${13}"          # the previous value of the alarm
src="${14}"                # the file and line number where the alarm was configured
duration="${15}"           # the duration in seconds of the previous alarm state
non_clear_duration="${16}" # the total duration in seconds this alarm is/was non-clear
units="${17}"              # the units of the value
info="${18}"               # a short description of the alarm

# -----------------------------------------------------------------------------
# screen out statuses for which we do not send notifications

# don't do anything if this is not WARNING, CRITICAL or CLEAR
if [ "${status}" != "WARNING" -a "${status}" != "CRITICAL" -a "${status}" != "CLEAR" ]
then
    echo >&2 "${me}: not sending notification for ${status} on '${chart}.${name}'"
    exit 1
fi

# don't do anything if this is CLEAR, but it was not WARNING or CRITICAL
if [ "${old_status}" != "WARNING" -a "${old_status}" != "CRITICAL" -a "${status}" = "CLEAR" ]
then
    echo >&2 "${me}: not sending notification for ${status} on '${chart}.${name}' (last status was ${old_status})"
    exit 1
fi

# -----------------------------------------------------------------------------
# load configuration

# By default fetch images from the global public registry.
# This is the default because all notification methods need to download
# these images over the Internet, and a private registry might not be
# reachable from the notification clients.
# It can be overridden in the configuration file.
images_base_url="https://registry.my-netdata.io"

# needed commands
# if empty, they will be searched for in the system path
curl=
sendmail=

# enable / disable features
SEND_SLACK="YES"
SEND_PUSHOVER="YES"
SEND_TELEGRAM="YES"
SEND_EMAIL="YES"

# slack configs
SLACK_WEBHOOK_URL=
DEFAULT_RECIPIENT_SLACK=
declare -A role_recipients_slack=()

# pushover configs
PUSHOVER_APP_TOKEN=
DEFAULT_RECIPIENT_PUSHOVER=
declare -A role_recipients_pushover=()

# telegram configs
TELEGRAM_BOT_TOKEN=
DEFAULT_RECIPIENT_TELEGRAM=
declare -A role_recipients_telegram=()

# email configs
DEFAULT_RECIPIENT_EMAIL="root"
declare -A role_recipients_email=()

# load the user configuration
# this will overwrite the variables above
if [ -f "${NETDATA_CONFIG_DIR}/health_alarm_notify.conf" ]
    then
    source "${NETDATA_CONFIG_DIR}/health_alarm_notify.conf"
fi

# -----------------------------------------------------------------------------
# filter a recipient based on alarm event severity

filter_recipient_by_criticality() {
    local method="${1}" x="${2}" r s
    shift

    r="${x/|*/}" # the recipient
    s="${x/*|/}" # the severity required for notifying this recipient

    # no severity filtering for this person
    [ "${r}" = "${s}" ] && return 0

    # the only valid severity filter is CRITICAL;
    # anything else means: do not filter at all
    s="${s^^}"
    [ "${s}" != "CRITICAL" ] && return 0

    # the new or the old status matches the severity
    if [ "${s}" = "${status}" -o "${s}" = "${old_status}" ]
    then
        [ ! -d "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" ] && \
            mkdir -p "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}"

        # we need to keep track of the notifications we sent
        # so that the same user will receive the recovery
        # even if old_status does not match the required severity
        touch "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}"
        return 0
    fi

    # it is a cleared alarm we have sent a notification for
    if [ "${status}" != "WARNING" -a "${status}" != "CRITICAL" -a -f "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" ]
    then
        rm "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}"
        return 0
    fi

    return 1
}

# -----------------------------------------------------------------------------
# find the recipients' addresses per method

declare -A arr_slack=()
declare -A arr_pushover=()
declare -A arr_telegram=()
declare -A arr_email=()

# netdata may call us with multiple roles, and roles may have multiple but
# overlapping recipients - so, here we find the unique recipients.
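#
# As an assumed example (values are illustrative, not defaults), the
# configuration file loaded above could map roles to recipients like this:
#
#   DEFAULT_RECIPIENT_SLACK="alarms"
#   role_recipients_email[sysadmin]="root peter@example.com"
#   role_recipients_slack[webmaster]="marketing"
#
# A recipient may also carry a severity filter, separated by '|':
# "peter@example.com|critical" limits that address to events that reach
# (or recover from) CRITICAL - filter_recipient_by_criticality() above
# remembers such notifications under ${NETDATA_CACHE_DIR}/alarm-notify/
# so that recoveries still reach the filtered recipient.
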
for x in ${roles//,/ }
do
    # the roles 'silent' and 'disabled' mean:
    # don't send a notification for this role
    [ "${x}" = "silent" -o "${x}" = "disabled" ] && continue

    # email
    a="${role_recipients_email[${x}]}"
    [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_EMAIL}"
    for r in ${a//,/ }
    do
        [ "${r}" != "disabled" ] && filter_recipient_by_criticality email "${r}" && arr_email[${r/|*/}]="1"
    done

    # pushover
    a="${role_recipients_pushover[${x}]}"
    [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_PUSHOVER}"
    for r in ${a//,/ }
    do
        [ "${r}" != "disabled" ] && filter_recipient_by_criticality pushover "${r}" && arr_pushover[${r/|*/}]="1"
    done

    # telegram
    a="${role_recipients_telegram[${x}]}"
    [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_TELEGRAM}"
    for r in ${a//,/ }
    do
        [ "${r}" != "disabled" ] && filter_recipient_by_criticality telegram "${r}" && arr_telegram[${r/|*/}]="1"
    done

    # slack
    a="${role_recipients_slack[${x}]}"
    [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_SLACK}"
    for r in ${a//,/ }
    do
        [ "${r}" != "disabled" ] && filter_recipient_by_criticality slack "${r}" && arr_slack[${r/|*/}]="1"
    done
done

# build the list of slack recipients (channels)
to_slack="${!arr_slack[*]}"
[ -z "${to_slack}" ] && SEND_SLACK="NO"

# build the list of pushover recipients (user tokens)
to_pushover="${!arr_pushover[*]}"
[ -z "${to_pushover}" ] && SEND_PUSHOVER="NO"

# build the list of telegram recipients (chat ids)
to_telegram="${!arr_telegram[*]}"
[ -z "${to_telegram}" ] && SEND_TELEGRAM="NO"

# build the list of email recipients (email addresses)
to_email=
for x in "${!arr_email[@]}"
do
    [ ! -z "${to_email}" ] && to_email="${to_email}, "
    to_email="${to_email}${x}"
done
[ -z "${to_email}" ] && SEND_EMAIL="NO"


# -----------------------------------------------------------------------------
# verify the delivery methods supported

# check slack
[ -z "${SLACK_WEBHOOK_URL}" ] && SEND_SLACK="NO"

# check pushover
[ -z "${PUSHOVER_APP_TOKEN}" ] && SEND_PUSHOVER="NO"

# check telegram
[ -z "${TELEGRAM_BOT_TOKEN}" ] && SEND_TELEGRAM="NO"

if [ \( "${SEND_PUSHOVER}" = "YES" -o "${SEND_SLACK}" = "YES" -o "${SEND_TELEGRAM}" = "YES" \) -a -z "${curl}" ]
    then
    curl="$(which curl 2>/dev/null || command -v curl 2>/dev/null)"
    if [ -z "${curl}" ]
        then
        SEND_PUSHOVER="NO"
        SEND_TELEGRAM="NO"
        SEND_SLACK="NO"
    fi
fi

if [ "${SEND_EMAIL}" = "YES" -a -z "${sendmail}" ]
    then
    sendmail="$(which sendmail 2>/dev/null || command -v sendmail 2>/dev/null)"
    [ -z "${sendmail}" ] && SEND_EMAIL="NO"
fi

# check that we have at least one notification method enabled
if [ "${SEND_EMAIL}" != "YES" -a "${SEND_PUSHOVER}" != "YES" -a "${SEND_TELEGRAM}" != "YES" -a "${SEND_SLACK}" != "YES" ]
    then
    echo >&2 "All notification methods are disabled. Not sending a notification."
    exit 1
fi

# -----------------------------------------------------------------------------
# get the system hostname

[ -z "${host}" ] && host="${NETDATA_HOSTNAME}"
[ -z "${host}" ] && host="${NETDATA_REGISTRY_HOSTNAME}"
[ -z "${host}" ] && host="$(hostname 2>/dev/null)"

# -----------------------------------------------------------------------------
# get the date the alarm happened

date="$(date --date=@${when} 2>/dev/null)"
[ -z "${date}" ] && date="$(date 2>/dev/null)"

# -----------------------------------------------------------------------------
# URL encode a string

urlencode() {
    local string="${1}" strlen encoded pos c o

    strlen=${#string}
    for (( pos=0 ; pos<strlen ; pos++ ))
    do
        c="${string:${pos}:1}"
        case "${c}" in
            [-_.~a-zA-Z0-9])
                o="${c}"
                ;;

            *)
                # percent-encode everything else
                printf -v o '%%%02x' "'${c}"
                ;;
        esac
        encoded+="${o}"
    done

    REPLY="${encoded}"
    echo "${REPLY}"
}

# -----------------------------------------------------------------------------
# convert a duration in seconds to a human readable duration
# (days, hours, minutes, seconds)

duration4human() {
    local s="${1}" d=0 h=0 m=0 ds="day" hs="hour" ms="minute" ss="second" ret
    d=$(( s / 86400 ))
    s=$(( s - (d * 86400) ))
    h=$(( s / 3600 ))
    s=$(( s - (h * 3600) ))
    m=$(( s / 60 ))
    s=$(( s - (m * 60) ))

    if [ ${d} -gt 0 ]
    then
        [ ${m} -ge 30 ] && h=$(( h + 1 ))
        [ ${d} -gt 1 ] && ds="days"
        [ ${h} -gt 1 ] && hs="hours"
        if [ ${h} -gt 0 ]
        then
            ret="${d} ${ds} and ${h} ${hs}"
        else
            ret="${d} ${ds}"
        fi
    elif [ ${h} -gt 0 ]
    then
        [ ${s} -ge 30 ] && m=$(( m + 1 ))
        [ ${h} -gt 1 ] && hs="hours"
        [ ${m} -gt 1 ] && ms="minutes"
        if [ ${m} -gt 0 ]
        then
            ret="${h} ${hs} and ${m} ${ms}"
        else
            ret="${h} ${hs}"
        fi
    elif [ ${m} -gt 0 ]
    then
        [ ${m} -gt 1 ] && ms="minutes"
        [ ${s} -ne 1 ] && ss="seconds"
        if [ ${s} -gt 0 ]
        then
            ret="${m} ${ms} and ${s} ${ss}"
        else
            ret="${m} ${ms}"
        fi
    else
        [ ${s} -ne 1 ] && ss="seconds"
        ret="${s} ${ss}"
    fi

    REPLY="${ret}"
    echo "${REPLY}"
}
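
# Illustrative behavior of the two helpers above (assumed sample inputs):
#
#   urlencode "disk space"   # prints "disk%20space" (and sets REPLY)
#   duration4human 3720      # prints "1 hour and 2 minutes"
#   duration4human 45        # prints "45 seconds"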
-z "${message}" ]; + then + for chatid in ${chatids} + do + # https://core.telegram.org/bots/api#sendmessage + httpcode=$(${curl} --write-out %{http_code} --silent --output /dev/null ${disableNotification} \ + --data-urlencode "parse_mode=HTML" \ + --data-urlencode "disable_web_page_preview=true" \ + --data-urlencode "text=$message" \ + "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=$chatid") + + if [ "${httpcode}" == "200" ] + then + echo >&2 "${me}: Sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'" + sent=$((sent + 1)) + elif [ "${httpcode}" == "401" ] + then + echo >&2 "${me}: Failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token." + else + echo >&2 "${me}: Failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP error code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 +} + +# ----------------------------------------------------------------------------- +# slack sender + +send_slack() { + local webhook="${1}" channels="${2}" httpcode sent=0 channel color payload + + [ "${SEND_SLACK}" != "YES" ] && return 1 + + case "${status}" in + WARNING) color="warning" ;; + CRITICAL) color="danger" ;; + CLEAR) color="good" ;; + *) color="#777777" ;; + esac + + for channel in ${channels} + do + payload="$(cat <", + "ts": ${when} + } + ] + } +EOF + )" + + httpcode=$(${curl} --write-out %{http_code} --silent --output /dev/null -X POST --data-urlencode "payload=${payload}" "${webhook}") + if [ "${httpcode}" == "200" ] + then + echo >&2 "${me}: Sent slack notification for: ${host} ${chart}.${name} is ${status} to '${channel}'" + sent=$((sent + 1)) + else + echo >&2 "${me}: Failed to send slack notification for: ${host} ${chart}.${name} is ${status} to '${channel}', with HTTP error code ${httpcode}." 

# -----------------------------------------------------------------------------
# prepare the content of the notification

# the URL to send the user to, when the notification is clicked
urlencode "${NETDATA_REGISTRY_HOSTNAME}" >/dev/null; url_host="${REPLY}"
urlencode "${chart}" >/dev/null; url_chart="${REPLY}"
urlencode "${family}" >/dev/null; url_family="${REPLY}"
urlencode "${name}" >/dev/null; url_name="${REPLY}"
goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}"

# the severity of the alarm
severity="${status}"

# for how long the alarm was raised
duration4human ${duration} >/dev/null; duration_txt="${REPLY}"
duration4human ${non_clear_duration} >/dev/null; non_clear_duration_txt="${REPLY}"
raised_for="(was ${old_status,,} for ${duration_txt})"

# the key status message
status_message="status unknown"

# the color of the alarm
color="grey"

# the alarm value
alarm="${name//_/ } = ${value} ${units}"

# the image of the alarm
image="${images_base_url}/images/seo-performance-128.png"

# prepare the title based on status
case "${status}" in
    CRITICAL)
        image="${images_base_url}/images/alert-128-red.png"
        status_message="is critical"
        color="#ca414b"
        ;;

    WARNING)
        image="${images_base_url}/images/alert-128-orange.png"
        status_message="needs attention"
        color="#caca4b"
        ;;

    CLEAR)
        image="${images_base_url}/images/check-mark-2-128-green.png"
        status_message="recovered"
        color="#77ca6d"

        # don't show the value when the status is CLEAR
        # for certain alarms, this value might not have any meaning
        alarm="${name//_/ } ${raised_for}"
        ;;
esac

if [ "${status}" = "CLEAR" ]
then
    severity="Recovered from ${old_status}"
    if [ ${non_clear_duration} -gt ${duration} ]
    then
        raised_for="(alarm was raised for ${non_clear_duration_txt})"
    fi

elif [ "${old_status}" = "WARNING" -a "${status}" = "CRITICAL" ]
then
    severity="Escalated to ${status}"
    if [ ${non_clear_duration} -gt ${duration} ]
    then
        raised_for="(alarm is raised for ${non_clear_duration_txt})"
    fi

elif [ "${old_status}" = "CRITICAL" -a "${status}" = "WARNING" ]
then
    severity="Demoted to ${status}"
    if [ ${non_clear_duration} -gt ${duration} ]
    then
        raised_for="(alarm is raised for ${non_clear_duration_txt})"
    fi

else
    raised_for=
fi

# prepare HTML versions of elements
info_html=
[ ! -z "${info}" ] && info_html=" <small><br/>${info}</small>"

raised_for_html=
[ ! -z "${raised_for}" ] && raised_for_html="<br/><small>${raised_for}</small>"

# -----------------------------------------------------------------------------
# send the slack notification

# slack aggregates posts from the same username
# so we use "${host} ${status}" as the bot username, to make them different

send_slack "${SLACK_WEBHOOK_URL}" "${to_slack}"
SENT_SLACK=$?

# -----------------------------------------------------------------------------
# send the pushover notification

send_pushover "${PUSHOVER_APP_TOKEN}" "${to_pushover}" "${when}" "${goto_url}" "${status}" "${host} ${status_message} - ${name//_/ } - ${chart}" "
<font color=\"${color}\"><b>${alarm}</b></font>${info_html}<br/>&nbsp;
<small><b>${chart}</b><br/>Chart<br/>&nbsp;</small>
<small><b>${family}</b><br/>Family<br/>&nbsp;</small>
<small><b>${severity}</b><br/>Severity<br/>&nbsp;</small>
<small><b>${date}</b>${raised_for_html}<br/>Time<br/>&nbsp;</small>
<a href=\"${goto_url}\">View Netdata</a><br/>&nbsp;
<small><small>The source of this alarm is line ${src}</small></small>
"

SENT_PUSHOVER=$?

# -----------------------------------------------------------------------------
# send the telegram.org message

# https://core.telegram.org/bots/api#formatting-options
telegram_message="<b>${severity}</b>"
[ "${status_message}" != "recovered" ] && telegram_message="${telegram_message}, ${status_message}"
telegram_message="${telegram_message}
${chart} (${family})
${alarm}
${info}"

send_telegram "${TELEGRAM_BOT_TOKEN}" "${to_telegram}" "${telegram_message}"

SENT_TELEGRAM=$?
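
# The email sender below pipes a complete message to 'sendmail -t', which
# takes the recipients from the To: header. The mail path itself can be
# smoke-tested by hand (illustrative; requires a working local MTA):
#
#   printf 'To: root\nSubject: test\n\ntest\n' | sendmail -t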

# -----------------------------------------------------------------------------
# send the email

# plain inline-styled HTML, so it renders reasonably in most mail clients
send_email <<EOF
To: ${to_email}
Subject: ${host} ${status_message} - ${name//_/ } - ${chart}
Content-Type: text/html

<!DOCTYPE html>
<html>
<body style="margin: 0; padding: 0; font-family: Helvetica, Arial, sans-serif; background: #f3f3f3;">
<table width="100%" cellpadding="0" cellspacing="0" border="0">
    <tr><td align="center" style="padding: 20px;">
        <table width="500" cellpadding="0" cellspacing="0" border="0" style="background: #ffffff;">
            <tr><td style="padding: 5px 15px; font-size: 11px; color: #666666;">netdata notification</td></tr>
            <tr><td align="center" style="padding: 15px; font-size: 18px; color: #ffffff; background-color: ${color};">${host} ${status_message}</td></tr>
            <tr><td style="padding: 10px 15px;"><b>${chart}</b><br/><small>Chart</small></td></tr>
            <tr><td style="padding: 10px 15px;"><b>${alarm}</b>${info_html}<br/><small>Alarm</small></td></tr>
            <tr><td style="padding: 10px 15px;"><b>${family}</b><br/><small>Family</small></td></tr>
            <tr><td style="padding: 10px 15px;"><b>${severity}</b><br/><small>Severity</small></td></tr>
            <tr><td style="padding: 10px 15px;"><b>${date}</b>${raised_for_html}<br/><small>Time</small></td></tr>
            <tr><td style="padding: 10px 15px;"><a href="${goto_url}">View Netdata</a></td></tr>
            <tr><td style="padding: 10px 15px; font-size: 11px;">The source of this alarm is line ${src}</td></tr>
            <tr><td style="padding: 10px 15px; font-size: 11px; color: #666666;">Sent by netdata, the real-time performance monitoring.</td></tr>
        </table>
    </td></tr>
</table>
</body>
</html>
EOF

SENT_EMAIL=$?

# -----------------------------------------------------------------------------
# let netdata know

# we did send something
[ ${SENT_EMAIL} -eq 0 -o ${SENT_PUSHOVER} -eq 0 -o ${SENT_TELEGRAM} -eq 0 -o ${SENT_SLACK} -eq 0 ] && exit 0

# we did not send anything
exit 1
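
# -----------------------------------------------------------------------------
# manual testing
#
# For reference, the script can be exercised by hand, passing illustrative
# values for the 18 positional parameters documented at the top, e.g.:
#
#   ./alarm-notify.sh sysadmin web01 42 7 3 "$(date +%s)" 10min_cpu_usage \
#       system.cpu cpu WARNING CLEAR 85 35 "health.d/cpu.conf,15" 60 60 "%" \
#       "average cpu utilization for the last 10 minutes"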