#!/usr/bin/env bash

# netdata
# real-time performance and health monitoring, done right!
# (C) 2016 Costa Tsaousis
# GPL v3+
#
# Script to send alarm notifications for netdata
#
# Features:
#  - multiple notification methods
#  - multiple roles per alarm
#  - multiple recipients per role
#  - severity filtering per recipient
#
# Supported notification methods:
#  - emails
#  - pushover.net notifications
#  - slack.com notifications
#  - telegram.org notifications
#

me="${0}"

# check for BASH v4+ (required for associative arrays)
if (( BASH_VERSINFO[0] < 4 ))
then
    echo >&2 "${me}: BASH version 4 or later is required (this is ${BASH_VERSION})."
    exit 1
fi

# defaults to allow running this script by hand
# (the two directories are only defaulted when unset; the rest also when empty)
NETDATA_CONFIG_DIR="${NETDATA_CONFIG_DIR-/etc/netdata}"
NETDATA_CACHE_DIR="${NETDATA_CACHE_DIR-/var/cache/netdata}"
: "${NETDATA_REGISTRY_URL:=https://registry.my-netdata.io}"
[[ -z "${NETDATA_HOSTNAME}" ]] && NETDATA_HOSTNAME="$(hostname)"
: "${NETDATA_REGISTRY_HOSTNAME:=${NETDATA_HOSTNAME}}"

# -----------------------------------------------------------------------------
# parse command line parameters

roles="${1}"               # the roles that should be notified for this event
host="${2}"                # the host that generated this event
unique_id="${3}"           # the unique id of this event
alarm_id="${4}"            # the unique id of the alarm that generated this event
event_id="${5}"            # the incremental id of the event, for this alarm id
when="${6}"                # the timestamp this event occurred
name="${7}"                # the name of the alarm, as given in netdata health.d entries
chart="${8}"               # the name of the chart (type.id)
family="${9}"              # the family of the chart
status="${10}"             # the current status : REMOVED, UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
old_status="${11}"         # the previous status: REMOVED, UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
value="${12}"              # the current value of the alarm
old_value="${13}"          # the previous value of the alarm
src="${14}"                # the line number and file the alarm has been configured
duration="${15}"           # the duration in seconds of the previous alarm state
non_clear_duration="${16}" # the total duration in seconds this is/was non-clear
units="${17}"              # the units of the value
info="${18}"               # a short description of the alarm

# -----------------------------------------------------------------------------
# screen statuses we don't need to send a notification

# don't do anything if this is not WARNING, CRITICAL or CLEAR
if [[ "${status}" != "WARNING" && "${status}" != "CRITICAL" && "${status}" != "CLEAR" ]]
then
    echo >&2 "${me}: not sending notification for ${status} on '${chart}.${name}'"
    exit 1
fi

# don't do anything if this is CLEAR, but it was not WARNING or CRITICAL
if [[ "${status}" == "CLEAR" && "${old_status}" != "WARNING" && "${old_status}" != "CRITICAL" ]]
then
    echo >&2 "${me}: not sending notification for ${status} on '${chart}.${name}' (last status was ${old_status})"
    exit 1
fi

# -----------------------------------------------------------------------------
# load configuration

# By default fetch images from the global public registry.
# This is required by default, since all notification methods need to download
# images via the Internet, and private registries might not be reachable.
# This can be overwritten at the configuration file.
# base URL of the images referenced by the notifications
images_base_url="https://registry.my-netdata.io"

# needed commands
# if empty they will be searched in the system path
curl=
sendmail=

# enable / disable features
SEND_SLACK="YES"
SEND_PUSHOVER="YES"
SEND_TELEGRAM="YES"
SEND_EMAIL="YES"

# slack configs
SLACK_WEBHOOK_URL=
DEFAULT_RECIPIENT_SLACK=
declare -A role_recipients_slack=()

# pushover configs
PUSHOVER_APP_TOKEN=
DEFAULT_RECIPIENT_PUSHOVER=
declare -A role_recipients_pushover=()

# telegram configs
TELEGRAM_BOT_TOKEN=
DEFAULT_RECIPIENT_TELEGRAM=
declare -A role_recipients_telegram=()

# email configs
DEFAULT_RECIPIENT_EMAIL="root"
declare -A role_recipients_email=()

# load the user configuration
# this will overwrite the variables above
if [[ -f "${NETDATA_CONFIG_DIR}/health_alarm_notify.conf" ]]
then
    source "${NETDATA_CONFIG_DIR}/health_alarm_notify.conf"
fi

# -----------------------------------------------------------------------------
# filter a recipient based on alarm event severity
#
# Globals:
#   status, old_status - new and previous status of the event (read)
#   alarm_id           - names the per-recipient marker file (read)
#   NETDATA_CACHE_DIR  - root of the marker files (read)
# Arguments:
#   $1 - notification method (email, pushover, telegram, slack)
#   $2 - recipient, optionally suffixed with "|severity"
# Returns:
#   0 when the recipient should be notified, 1 when it should be skipped
# Side effects:
#   creates / removes ${NETDATA_CACHE_DIR}/alarm-notify/<method>/<recipient>/<alarm_id>

filter_recipient_by_criticality() {
    local method="${1}" spec="${2}"
    local recipient severity marker_dir marker

    recipient="${spec/|*/}"   # everything before the first '|'
    severity="${spec/*|/}"    # everything after the last '|'

    # no '|' in the spec: no severity filtering for this person
    [[ "${recipient}" == "${severity}" ]] && return 0

    # only CRITICAL is supported as a filter;
    # any other severity keyword disables filtering for this recipient
    severity="${severity^^}"
    [[ "${severity}" != "CRITICAL" ]] && return 0

    marker_dir="${NETDATA_CACHE_DIR}/alarm-notify/${method}/${recipient}"
    marker="${marker_dir}/${alarm_id}"

    # the new or the old status matches the severity
    if [[ "${severity}" == "${status}" || "${severity}" == "${old_status}" ]]
    then
        mkdir -p "${marker_dir}"

        # we need to keep track of the notifications we sent
        # so that the same user will receive the recovery
        # even if old_status does not match the required severity
        touch "${marker}"
        return 0
    fi

    # it is a cleared alarm we have sent notification for
    if [[ "${status}" != "WARNING" && "${status}" != "CRITICAL" && -f "${marker}" ]]
    then
        rm -f -- "${marker}"
        return 0
    fi

    return 1
}

# -----------------------------------------------------------------------------
# find the recipients' addresses per method

declare -A arr_slack=()
declare -A arr_pushover=()
declare -A arr_telegram=()
declare -A arr_email=()

# netdata may call us with multiple roles, and roles may have multiple but
# overlapping recipients - so, here we find the unique recipients.
for x in ${roles//,/ } do # the roles 'silent' and 'disabled' mean: # don't send a notification for this role [ "${x}" = "silent" -o "${x}" = "disabled" ] && continue # email a="${role_recipients_email[${x}]}" [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_EMAIL}" for r in ${a//,/ } do [ "${r}" != "disabled" ] && filter_recipient_by_criticality email "${r}" && arr_email[${r/|*/}]="1" done # pushover a="${role_recipients_pushover[${x}]}" [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_PUSHOVER}" for r in ${a//,/ } do [ "${r}" != "disabled" ] && filter_recipient_by_criticality pushover "${r}" && arr_pushover[${r/|*/}]="1" done # telegram a="${role_recipients_telegram[${x}]}" [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_TELEGRAM}" for r in ${a//,/ } do [ "${r}" != "disabled" ] && filter_recipient_by_criticality telegram "${r}" && arr_telegram[${r/|*/}]="1" done # slack a="${role_recipients_slack[${x}]}" [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_SLACK}" for r in ${a//,/ } do [ "${r}" != "disabled" ] && filter_recipient_by_criticality slack "${r}" && arr_slack[${r/|*/}]="1" done done # build the list of slack recipients (channels) to_slack="${!arr_slack[*]}" [ -z "${to_slack}" ] && SEND_SLACK="NO" # build the list of pushover recipients (user tokens) to_pushover="${!arr_pushover[*]}" [ -z "${to_pushover}" ] && SEND_PUSHOVER="NO" # check array of telegram recipients (chat ids) to_telegram="${!arr_telegram[*]}" [ -z "${to_telegram}" ] && SEND_TELEGRAM="NO" # build the list of email recipients (email addresses) to_email= for x in "${!arr_email[@]}" do [ ! 
-z "${to_email}" ] && to_email="${to_email}, " to_email="${to_email}${x}" done [ -z "${to_email}" ] && SEND_EMAIL="NO" # ----------------------------------------------------------------------------- # verify the delivery methods supported # check slack [ -z "${SLACK_WEBHOOK_URL}" ] && SEND_SLACK="NO" # check pushover [ -z "${PUSHOVER_APP_TOKEN}" ] && SEND_PUSHOVER="NO" # check telegram [ -z "${TELEGRAM_BOT_TOKEN}" ] && SEND_TELEGRAM="NO" if [ \( "${SEND_PUSHOVER}" = "YES" -o "${SEND_SLACK}" = "YES" -o "${SEND_TELEGRAM}" = "YES" \) -a -z "${curl}" ] then curl="$(which curl 2>/dev/null || command -v curl 2>/dev/null)" if [ -z "${curl}" ] then SEND_PUSHOVER="NO" SEND_TELEGRAM="NO" SEND_SLACK="NO" fi fi if [ "${SEND_EMAIL}" = "YES" -a -z "${sendmail}" ] then sendmail="$(which sendmail 2>/dev/null || command -v sendmail 2>/dev/null)" [ -z "${sendmail}" ] && SEND_EMAIL="NO" fi # check that we have at least a method enabled if [ "${SEND_EMAIL}" != "YES" -a "${SEND_PUSHOVER}" != "YES" -a "${SEND_TELEGRAM}" != "YES" -a "${SEND_SLACK}" != "YES" ] then echo >&2 "All notification methods are disabled. Not sending a notification." 
exit 1 fi # ----------------------------------------------------------------------------- # get the system hostname [ -z "${host}" ] && host="${NETDATA_HOSTNAME}" [ -z "${host}" ] && host="${NETDATA_REGISTRY_HOSTNAME}" [ -z "${host}" ] && host="$(hostname 2>/dev/null)" # ----------------------------------------------------------------------------- # get the date the alarm happened date="$(date --date=@${when} 2>/dev/null)" [ -z "${date}" ] && date="$(date 2>/dev/null)" # ----------------------------------------------------------------------------- # URL encode a string urlencode() { local string="${1}" strlen encoded pos c o strlen=${#string} for (( pos=0 ; pos&2 "${me}: Sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'" return 0 else echo >&2 "${me}: Failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}." return 1 fi fi return 1 } # ----------------------------------------------------------------------------- # pushover sender send_pushover() { local apptoken="${1}" usertokens="${2}" when="${3}" url="${4}" status="${5}" title="${6}" message="${7}" httpcode sent=0 user priority if [ "${SEND_PUSHOVER}" = "YES" -a ! -z "${apptoken}" -a ! -z "${usertokens}" -a ! -z "${title}" -a ! 
-z "${message}" ] then # https://pushover.net/api priority=-2 case "${status}" in CLEAR) priority=-1;; # low priority: no sound or vibration WARNING) priority=0;; # normal priority: respect quiet hours CRITICAL) priority=1;; # high priority: bypass quiet hours *) priority=-2;; # lowest priority: no notification at all esac for user in ${usertokens} do httpcode=$(${curl} --write-out %{http_code} --silent --output /dev/null \ --form-string "token=${apptoken}" \ --form-string "user=${user}" \ --form-string "html=1" \ --form-string "title=${title}" \ --form-string "message=${message}" \ --form-string "timestamp=${when}" \ --form-string "url=${url}" \ --form-string "url_title=Open netdata dashboard to view the alarm" \ --form-string "priority=${priority}" \ https://api.pushover.net/1/messages.json) if [ "${httpcode}" == "200" ] then echo >&2 "${me}: Sent pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}'" sent=$((sent + 1)) else echo >&2 "${me}: Failed to send pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP error code ${httpcode}." fi done [ ${sent} -gt 0 ] && return 0 fi return 1 } # ----------------------------------------------------------------------------- # telegram sender send_telegram() { local bottoken="${1}" chatids="${2}" message="${3}" httpcode sent=0 chatid disableNotification="" if [ "${status}" = "CLEAR" ]; then disableNotification="--data-urlencode disable_notification=true"; fi if [ "${SEND_TELEGRAM}" = "YES" -a ! -z "${bottoken}" -a ! -z "${chatids}" -a ! 
-z "${message}" ]; then for chatid in ${chatids} do # https://core.telegram.org/bots/api#sendmessage httpcode=$(${curl} --write-out %{http_code} --silent --output /dev/null ${disableNotification} \ --data-urlencode "parse_mode=HTML" \ --data-urlencode "disable_web_page_preview=true" \ --data-urlencode "text=$message" \ "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=$chatid") if [ "${httpcode}" == "200" ] then echo >&2 "${me}: Sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'" sent=$((sent + 1)) elif [ "${httpcode}" == "401" ] then echo >&2 "${me}: Failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token." else echo >&2 "${me}: Failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP error code ${httpcode}." fi done [ ${sent} -gt 0 ] && return 0 fi return 1 } # ----------------------------------------------------------------------------- # slack sender send_slack() { local webhook="${1}" channels="${2}" httpcode sent=0 channel color payload [ "${SEND_SLACK}" != "YES" ] && return 1 case "${status}" in WARNING) color="warning" ;; CRITICAL) color="danger" ;; CLEAR) color="good" ;; *) color="#777777" ;; esac for channel in ${channels} do payload="$(cat <", "ts": ${when} } ] } EOF )" httpcode=$(${curl} --write-out %{http_code} --silent --output /dev/null -X POST --data-urlencode "payload=${payload}" "${webhook}") if [ "${httpcode}" == "200" ] then echo >&2 "${me}: Sent slack notification for: ${host} ${chart}.${name} is ${status} to '${channel}'" sent=$((sent + 1)) else echo >&2 "${me}: Failed to send slack notification for: ${host} ${chart}.${name} is ${status} to '${channel}', with HTTP error code ${httpcode}." 
fi done [ ${sent} -gt 0 ] && return 0 return 1 } # ----------------------------------------------------------------------------- # prepare the content of the notification # the url to send the user on click urlencode "${NETDATA_REGISTRY_HOSTNAME}" >/dev/null; url_host="${REPLY}" urlencode "${chart}" >/dev/null; url_chart="${REPLY}" urlencode "${family}" >/dev/null; url_family="${REPLY}" urlencode "${name}" >/dev/null; url_name="${REPLY}" goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}" # the severity of the alarm severity="${status}" # the time the alarm was raised duration4human ${duration} >/dev/null; duration_txt="${REPLY}" duration4human ${non_clear_duration} >/dev/null; non_clear_duration_txt="${REPLY}" raised_for="(was ${old_status,,} for ${duration_txt})" # the key status message status_message="status unknown" # the color of the alarm color="grey" # the alarm value alarm="${name//_/ } = ${value} ${units}" # the image of the alarm image="${images_base_url}/images/seo-performance-128.png" # prepare the title based on status case "${status}" in CRITICAL) image="${images_base_url}/images/alert-128-red.png" status_message="is critical" color="#ca414b" ;; WARNING) image="${images_base_url}/images/alert-128-orange.png" status_message="needs attention" color="#caca4b" ;; CLEAR) image="${images_base_url}/images/check-mark-2-128-green.png" status_message="recovered" color="#77ca6d" # don't show the value when the status is CLEAR # for certain alarms, this value might not have any meaning alarm="${name//_/ } ${raised_for}" ;; esac if [ "${status}" = "CLEAR" ] then severity="Recovered from ${old_status}" if [ $non_clear_duration -gt $duration ] then raised_for="(alarm was raised for ${non_clear_duration_txt})" fi elif [ "${old_status}" = "WARNING" -a "${status}" = "CRITICAL" ] then severity="Escalated to 
${status}" if [ $non_clear_duration -gt $duration ] then raised_for="(alarm is raised for ${non_clear_duration_txt})" fi elif [ "${old_status}" = "CRITICAL" -a "${status}" = "WARNING" ] then severity="Demoted to ${status}" if [ $non_clear_duration -gt $duration ] then raised_for="(alarm is raised for ${non_clear_duration_txt})" fi else raised_for= fi # prepare HTML versions of elements info_html= [ ! -z "${info}" ] && info_html="
${info}
" raised_for_html= [ ! -z "${raised_for}" ] && raised_for_html="
${raised_for}" # ----------------------------------------------------------------------------- # send the slack notification # slack aggregates posts from the same username # so we use "${host} ${status}" as the bot username, to make them diff send_slack "${SLACK_WEBHOOK_URL}" "${to_slack}" SENT_SLACK=$? # ----------------------------------------------------------------------------- # send the pushover notification send_pushover "${PUSHOVER_APP_TOKEN}" "${to_pushover}" "${when}" "${goto_url}" "${status}" "${host} ${status_message} - ${name//_/ } - ${chart}" " ${alarm}${info_html}
  ${chart}
Chart
 
${family}
Family
 
${severity}
Severity
 
${date}${raised_for_html}
Time
 
View Netdata
  The source of this alarm is line ${src} " SENT_PUSHOVER=$? # ----------------------------------------------------------------------------- # send the telegram.org message # https://core.telegram.org/bots/api#formatting-options telegram_message="${severity}" [ "${status_message}" != "recovered" ] && telegram_message="${telegram_message}, ${status_message}" telegram_message="${telegram_message} ${chart} (${family}) ${alarm} ${info}" send_telegram "${TELEGRAM_BOT_TOKEN}" "${to_telegram}" "${telegram_message}" SENT_TELEGRAM=$? # ----------------------------------------------------------------------------- # send the email send_email <
netdata notification

${host} ${status_message}

${chart} Chart
${alarm}${info_html} Alarm
${family} Family
${severity} Severity
${date} ${raised_for_html} Time
View Netdata
The source of this alarm is line ${src}
Sent by netdata, the real-time performance monitoring.
EOF SENT_EMAIL=$? # ----------------------------------------------------------------------------- # let netdata know # we did send something [ ${SENT_EMAIL} -eq 0 -o ${SENT_PUSHOVER} -eq 0 -o ${SENT_TELEGRAM} -eq 0 -o ${SENT_SLACK} -eq 0 ] && exit 0 # we did not send anything exit 1