From 87649cf32bd0e14d5a903fb85b01e9f41a253540 Mon Sep 17 00:00:00 2001 From: Federico Ceratto Date: Wed, 23 Nov 2016 15:49:10 +0000 Subject: New upstream version 1.4.0+dfsg --- plugins.d/Makefile.am | 1 + plugins.d/Makefile.in | 13 +- plugins.d/alarm-email.sh | 264 +-------------- plugins.d/alarm-notify.sh | 767 +++++++++++++++++++++++++++++++++++++++++++ plugins.d/cgroup-name.sh | 2 +- plugins.d/charts.d.plugin | 99 +++--- plugins.d/loopsleepms.sh.inc | 175 +++++++--- plugins.d/tc-qos-helper.sh | 7 - 8 files changed, 964 insertions(+), 364 deletions(-) create mode 100755 plugins.d/alarm-notify.sh (limited to 'plugins.d') diff --git a/plugins.d/Makefile.am b/plugins.d/Makefile.am index b8a28610a..4bc0dc447 100644 --- a/plugins.d/Makefile.am +++ b/plugins.d/Makefile.am @@ -9,6 +9,7 @@ dist_plugins_DATA = \ dist_plugins_SCRIPTS = \ alarm-email.sh \ + alarm-notify.sh \ cgroup-name.sh \ charts.d.dryrun-helper.sh \ charts.d.plugin \ diff --git a/plugins.d/Makefile.in b/plugins.d/Makefile.in index 06211d51c..1854ea861 100644 --- a/plugins.d/Makefile.in +++ b/plugins.d/Makefile.in @@ -83,8 +83,13 @@ subdir = plugins.d DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ $(dist_plugins_SCRIPTS) $(dist_plugins_DATA) ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_pthread.m4 \ - $(top_srcdir)/configure.ac +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_c___atomic.m4 \ + $(top_srcdir)/m4/ax_c__generic.m4 \ + $(top_srcdir)/m4/ax_c_mallinfo.m4 \ + $(top_srcdir)/m4/ax_c_mallopt.m4 \ + $(top_srcdir)/m4/ax_check_compile_flag.m4 \ + $(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/jemalloc.m4 \ + $(top_srcdir)/m4/tcmalloc.m4 $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) mkinstalldirs = $(install_sh) -d @@ -208,6 +213,7 @@ PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ PTHREAD_LIBS = @PTHREAD_LIBS@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SSE_CANDIDATE = @SSE_CANDIDATE@ STRIP = @STRIP@ UUID_CFLAGS = @UUID_CFLAGS@ UUID_LIBS = @UUID_LIBS@ @@ -240,6 +246,8 @@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ +has_jemalloc = @has_jemalloc@ +has_tcmalloc = @has_tcmalloc@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ @@ -286,6 +294,7 @@ dist_plugins_DATA = \ dist_plugins_SCRIPTS = \ alarm-email.sh \ + alarm-notify.sh \ cgroup-name.sh \ charts.d.dryrun-helper.sh \ charts.d.plugin \ diff --git a/plugins.d/alarm-email.sh b/plugins.d/alarm-email.sh index 78c79ccdb..df083c655 100755 --- a/plugins.d/alarm-email.sh +++ b/plugins.d/alarm-email.sh @@ -1,264 +1,6 @@ #!/usr/bin/env bash -me="${0}" +# OBSOLETE - REPLACED WITH +# alarm-notify.sh -sendmail="$(which sendmail 2>/dev/null || command -v sendmail 2>/dev/null)" -if [ -z "${sendmail}" ] -then - echo >&2 "I cannot send emails - there is no sendmail command available." -fi - -sendmail_from_pipe() { - "${sendmail}" -t - - if [ $? -eq 0 ] - then - echo >&2 "${me}: Sent notification email for ${status} on '${chart}.${name}'" - return 0 - else - echo >&2 "${me}: FAILED to send notification email for ${status} on '${chart}.${name}'" - return 1 - fi -} - -name="${1}" # the name of the alarm, as given in netdata health.d entries -chart="${2}" # the name of the chart (type.id) -family="${3}" # the family of the chart -status="${4}" # the current status : UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL -old_status="${5}" # the previous status: UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL -value="${6}" # the current value -old_value="${7}" # the previous value -src="${8}" # the line number and file the alarm has been configured -duration="${9}" # the duration in seconds the previous state took -non_clear_duration="${10}" # the total duration in seconds this is non-clear -units="${11}" # the units of the value -info="${12}" # a short description of the alarm - -[ ! -z "${info}" ] && info="
${info}
" - -# get the system hostname -hostname="${NETDATA_HOSTNAME}" -[ -z "${hostname}" ] && hostname="${NETDATA_REGISTRY_HOSTNAME}" -[ -z "${hostname}" ] && hostname="$(hostname)" - -goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?machine_guid=${NETDATA_REGISTRY_UNIQUE_ID}&chart=${chart}&family=${family}" - -# get the current date -date="$(date)" - -duration4human() { - local s="${1}" d=0 h=0 m=0 ds="day" hs="hour" ms="minute" ss="second" - d=$(( s / 86400 )) - s=$(( s - (d * 86400) )) - h=$(( s / 3600 )) - s=$(( s - (h * 3600) )) - m=$(( s / 60 )) - s=$(( s - (m * 60) )) - - if [ ${d} -gt 0 ] - then - [ ${m} -ge 30 ] && h=$(( h + 1 )) - [ ${d} -gt 1 ] && ds="days" - [ ${h} -gt 1 ] && hs="hours" - if [ ${h} -gt 0 ] - then - echo "${d} ${ds} and ${h} ${hs}" - else - echo "${d} ${ds}" - fi - elif [ ${h} -gt 0 ] - then - [ ${s} -ge 30 ] && m=$(( m + 1 )) - [ ${h} -gt 1 ] && hs="hours" - [ ${m} -gt 1 ] && ms="minutes" - if [ ${m} -gt 0 ] - then - echo "${h} ${hs} and ${m} ${ms}" - else - echo "${h} ${hs}" - fi - elif [ ${m} -gt 0 ] - then - [ ${m} -gt 1 ] && ms="minutes" - [ ${s} -gt 1 ] && ss="seconds" - if [ ${s} -gt 0 ] - then - echo "${m} ${ms} and ${s} ${ss}" - else - echo "${m} ${ms}" - fi - else - [ ${s} -gt 1 ] && ss="seconds" - echo "${s} ${ss}" - fi -} - -severity="${status}" -raised_for="
(was ${old_status,,} for $(duration4human ${duration}))" -status_message="status unknown" -color="grey" -alarm="${name} = ${value} ${units}" - -# prepare the title based on status -case "${status}" in - CRITICAL) - status_message="is critical" - color="#ca414b" - ;; - - WARNING) - status_message="needs attention" - color="#caca4b" - ;; - - CLEAR) - status_message="recovered" - color="#77ca6d" - - # don't show the value when the status is CLEAR - # for certain alarms, this value might not have any meaning - alarm="${name}" - ;; -esac - -if [ "${status}" != "WARNING" -a "${status}" != "CRITICAL" -a "${status}" != "CLEAR" ] -then - # don't do anything if this is not WARNING, CRITICAL or CLEAR - echo >&2 "${me}: not sending notification email for ${status} on '${chart}.${name}'" - exit 0 -elif [ "${old_status}" != "WARNING" -a "${old_status}" != "CRITICAL" -a "${status}" = "CLEAR" ] -then - # don't do anything if this is CLEAR, but it was not WARNING or CRITICAL - echo >&2 "${me}: not sending notification email for ${status} on '${chart}.${name}' (last status was ${old_status})" - exit 0 -elif [ "${status}" = "CLEAR" ] -then - severity="Recovered from ${old_status}" - if [ $non_clear_duration -gt $duration ] - then - raised_for="
(had issues for $(duration4human ${non_clear_duration}))" - fi - -elif [ "${old_status}" = "WARNING" -a "${status}" = "CRITICAL" ] -then - severity="Escalated to ${status}" - if [ $non_clear_duration -gt $duration ] - then - raised_for="
(has issues for $(duration4human ${non_clear_duration}))" - fi - -elif [ "${old_status}" = "CRITICAL" -a "${status}" = "WARNING" ] -then - severity="Demoted to ${status}" - if [ $non_clear_duration -gt $duration ] - then - raised_for="
(has issues for $(duration4human ${non_clear_duration}))" - fi - -else - raised_for= -fi - -# send the email -cat < - - - - - - - - - -
-
- - - - - - - - - - - - -
-
netdata notification
-
-

${hostname} ${status_message}

-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
- ${chart} - Chart -
- ${alarm}${info} - Alarm -
- ${family} - Family -
- ${severity} - Severity -
${date} - ${raised_for} Time -
The source of this alarm is line ${src} -
Sent by - netdata, the real-time performance monitoring. -
-
-
-
-
- - -EOF +${0/alarm-email.sh/alarm-notify.sh} "${@}" diff --git a/plugins.d/alarm-notify.sh b/plugins.d/alarm-notify.sh new file mode 100755 index 000000000..feec6ceae --- /dev/null +++ b/plugins.d/alarm-notify.sh @@ -0,0 +1,767 @@ +#!/usr/bin/env bash + +# netdata +# real-time performance and health monitoring, done right! +# (C) 2016 Costa Tsaousis +# GPL v3+ +# +# Script the send alarm notifications for netdata +# +# Features: +# - multiple notification methods +# - multiple roles per alarm +# - multiple recipients per role +# - severity filtering per recipient +# +# Supported notification methods: +# - emails +# - pushover.net notifications +# - slack.com notifications +# - telegram.org notifications +# + +me="${0}" + +# check for BASH v4+ (required for associative arrays) +[ $(( ${BASH_VERSINFO[0]} )) -lt 4 ] && \ + echo >&2 "${me}: BASH version 4 or later is required (this is ${BASH_VERSION})." && \ + exit 1 + +# defaults to allow running this script by hand +NETDATA_CONFIG_DIR="${NETDATA_CONFIG_DIR-/etc/netdata}" +NETDATA_CACHE_DIR="${NETDATA_CACHE_DIR-/var/cache/netdata}" +[ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io" +[ -z "${NETDATA_HOSTNAME}" ] && NETDATA_HOSTNAME="$(hostname)" +[ -z "${NETDATA_REGISTRY_HOSTNAME}" ] && NETDATA_REGISTRY_HOSTNAME="${NETDATA_HOSTNAME}" + +# ----------------------------------------------------------------------------- +# parse command line parameters + +roles="${1}" # the roles that should be notified for this event +host="${2}" # the host generated this event +unique_id="${3}" # the unique id of this event +alarm_id="${4}" # the unique id of the alarm that generated this event +event_id="${5}" # the incremental id of the event, for this alarm id +when="${6}" # the timestamp this event occurred +name="${7}" # the name of the alarm, as given in netdata health.d entries +chart="${8}" # the name of the chart (type.id) +family="${9}" # the family of the chart +status="${10}" # the current status : REMOVED, UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL +old_status="${11}" # the previous status: REMOVED, UNITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL +value="${12}" # the current value of the alarm +old_value="${13}" # the previous value of the alarm +src="${14}" # the line number and file the alarm has been configured +duration="${15}" # the duration in seconds of the previous alarm state +non_clear_duration="${16}" # the total duration in seconds this is/was non-clear +units="${17}" # the units of the value +info="${18}" # a short description of the alarm + +# ----------------------------------------------------------------------------- +# screen statuses we don't need to send a notification + +# don't do anything if this is not WARNING, CRITICAL or CLEAR +if [ "${status}" != "WARNING" -a "${status}" != "CRITICAL" -a "${status}" != "CLEAR" ] +then + echo >&2 "${me}: not sending notification for ${status} on '${chart}.${name}'" + exit 1 +fi + +# don't do anything if this is CLEAR, but it was not WARNING or CRITICAL +if [ "${old_status}" != "WARNING" -a "${old_status}" != "CRITICAL" -a "${status}" = "CLEAR" ] +then + echo >&2 "${me}: not sending notification for ${status} on '${chart}.${name}' (last status was ${old_status})" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# load configuration + +# By default fetch images from the global public registry. +# This is required by default, since all notification methods need to download +# images via the Internet, and private registries might not be reachable. +# This can be overwritten at the configuration file. +images_base_url="https://registry.my-netdata.io" + +# needed commands +# if empty they will be searched in the system path +curl= +sendmail= + +# enable / disable features +SEND_SLACK="YES" +SEND_PUSHOVER="YES" +SEND_TELEGRAM="YES" +SEND_EMAIL="YES" + +# slack configs +SLACK_WEBHOOK_URL= +DEFAULT_RECIPIENT_SLACK= +declare -A role_recipients_slack=() + +# pushover configs +PUSHOVER_APP_TOKEN= +DEFAULT_RECIPIENT_PUSHOVER= +declare -A role_recipients_pushover=() + +# telegram configs +TELEGRAM_BOT_TOKEN= +DEFAULT_RECIPIENT_TELEGRAM= +declare -A role_recipients_telegram=() + +# email configs +DEFAULT_RECIPIENT_EMAIL="root" +declare -A role_recipients_email=() + +# load the user configuration +# this will overwrite the variables above +if [ -f "${NETDATA_CONFIG_DIR}/health_alarm_notify.conf" ] + then + source "${NETDATA_CONFIG_DIR}/health_alarm_notify.conf" +fi + +# ----------------------------------------------------------------------------- +# filter a recipient based on alarm event severity + +filter_recipient_by_criticality() { + local method="${1}" x="${2}" r s + shift + + r="${x/|*/}" # the recipient + s="${x/*|/}" # the severity required for notifying this recipient + + # no severity filtering for this person + [ "${r}" = "${s}" ] && return 0 + + # the severity is invalid + s="${s^^}" + [ "${s}" != "CRITICAL" ] && return 0 + + # the new or the old status matches the severity + if [ "${s}" = "${status}" -o "${s}" = "${old_status}" ] + then + [ ! -d "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" ] && \ + mkdir -p "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" + + # we need to keep track of the notifications we sent + # so that the same user will receive the recovery + # even if old_status does not match the required severity + touch "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" + return 0 + fi + + # it is a cleared alarm we have sent notification for + if [ "${status}" != "WARNING" -a "${status}" != "CRITICAL" -a -f "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" ] + then + rm "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" + return 0 + fi + + return 1 +} + +# ----------------------------------------------------------------------------- +# find the recipients' addresses per method + +declare -A arr_slack=() +declare -A arr_pushover=() +declare -A arr_telegram=() +declare -A arr_email=() + +# netdata may call us with multiple roles, and roles may have multiple but +# overlapping recipients - so, here we find the unique recipients. +for x in ${roles//,/ } +do + # the roles 'silent' and 'disabled' mean: + # don't send a notification for this role + [ "${x}" = "silent" -o "${x}" = "disabled" ] && continue + + # email + a="${role_recipients_email[${x}]}" + [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_EMAIL}" + for r in ${a//,/ } + do + [ "${r}" != "disabled" ] && filter_recipient_by_criticality email "${r}" && arr_email[${r/|*/}]="1" + done + + # pushover + a="${role_recipients_pushover[${x}]}" + [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_PUSHOVER}" + for r in ${a//,/ } + do + [ "${r}" != "disabled" ] && filter_recipient_by_criticality pushover "${r}" && arr_pushover[${r/|*/}]="1" + done + + # telegram + a="${role_recipients_telegram[${x}]}" + [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_TELEGRAM}" + for r in ${a//,/ } + do + [ "${r}" != "disabled" ] && filter_recipient_by_criticality telegram "${r}" && arr_telegram[${r/|*/}]="1" + done + + # slack + a="${role_recipients_slack[${x}]}" + [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_SLACK}" + for r in ${a//,/ } + do + [ "${r}" != "disabled" ] && filter_recipient_by_criticality slack "${r}" && arr_slack[${r/|*/}]="1" + done +done + +# build the list of slack recipients (channels) +to_slack="${!arr_slack[*]}" +[ -z "${to_slack}" ] && SEND_SLACK="NO" + +# build the list of pushover recipients (user tokens) +to_pushover="${!arr_pushover[*]}" +[ -z "${to_pushover}" ] && SEND_PUSHOVER="NO" + +# check array of telegram recipients (chat ids) +to_telegram="${!arr_telegram[*]}" +[ -z "${to_telegram}" ] && SEND_TELEGRAM="NO" + +# build the list of email recipients (email addresses) +to_email= +for x in "${!arr_email[@]}" +do + [ ! -z "${to_email}" ] && to_email="${to_email}, " + to_email="${to_email}${x}" +done +[ -z "${to_email}" ] && SEND_EMAIL="NO" + + +# ----------------------------------------------------------------------------- +# verify the delivery methods supported + +# check slack +[ -z "${SLACK_WEBHOOK_URL}" ] && SEND_SLACK="NO" + +# check pushover +[ -z "${PUSHOVER_APP_TOKEN}" ] && SEND_PUSHOVER="NO" + +# check telegram +[ -z "${TELEGRAM_BOT_TOKEN}" ] && SEND_TELEGRAM="NO" + +if [ \( "${SEND_PUSHOVER}" = "YES" -o "${SEND_SLACK}" = "YES" -o "${SEND_TELEGRAM}" = "YES" \) -a -z "${curl}" ] + then + curl="$(which curl 2>/dev/null || command -v curl 2>/dev/null)" + if [ -z "${curl}" ] + then + SEND_PUSHOVER="NO" + SEND_TELEGRAM="NO" + SEND_SLACK="NO" + fi +fi + +if [ "${SEND_EMAIL}" = "YES" -a -z "${sendmail}" ] + then + sendmail="$(which sendmail 2>/dev/null || command -v sendmail 2>/dev/null)" + [ -z "${sendmail}" ] && SEND_EMAIL="NO" +fi + +# check that we have at least a method enabled +if [ "${SEND_EMAIL}" != "YES" -a "${SEND_PUSHOVER}" != "YES" -a "${SEND_TELEGRAM}" != "YES" -a "${SEND_SLACK}" != "YES" ] + then + echo >&2 "All notification methods are disabled. Not sending a notification." + exit 1 +fi + +# ----------------------------------------------------------------------------- +# get the system hostname + +[ -z "${host}" ] && host="${NETDATA_HOSTNAME}" +[ -z "${host}" ] && host="${NETDATA_REGISTRY_HOSTNAME}" +[ -z "${host}" ] && host="$(hostname 2>/dev/null)" + +# ----------------------------------------------------------------------------- +# get the date the alarm happened + +date="$(date --date=@${when} 2>/dev/null)" +[ -z "${date}" ] && date="$(date 2>/dev/null)" + +# ----------------------------------------------------------------------------- +# URL encode a string + +urlencode() { + local string="${1}" strlen encoded pos c o + + strlen=${#string} + for (( pos=0 ; pos&2 "${me}: Sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'" + return 0 + else + echo >&2 "${me}: Failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}." + return 1 + fi + fi + + return 1 +} + +# ----------------------------------------------------------------------------- +# pushover sender + +send_pushover() { + local apptoken="${1}" usertokens="${2}" when="${3}" url="${4}" status="${5}" title="${6}" message="${7}" httpcode sent=0 user priority + + if [ "${SEND_PUSHOVER}" = "YES" -a ! -z "${apptoken}" -a ! -z "${usertokens}" -a ! -z "${title}" -a ! -z "${message}" ] + then + + # https://pushover.net/api + priority=-2 + case "${status}" in + CLEAR) priority=-1;; # low priority: no sound or vibration + WARNING) priotity=0;; # normal priority: respect quiet hours + CRITICAL) priority=1;; # high priority: bypass quiet hours + *) priority=-2;; # lowest priority: no notification at all + esac + + for user in ${usertokens} + do + httpcode=$(${curl} --write-out %{http_code} --silent --output /dev/null \ + --form-string "token=${apptoken}" \ + --form-string "user=${user}" \ + --form-string "html=1" \ + --form-string "title=${title}" \ + --form-string "message=${message}" \ + --form-string "timestamp=${when}" \ + --form-string "url=${url}" \ + --form-string "url_title=Open netdata dashboard to view the alarm" \ + --form-string "priority=${priority}" \ + https://api.pushover.net/1/messages.json) + + if [ "${httpcode}" == "200" ] + then + echo >&2 "${me}: Sent pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}'" + sent=$((sent + 1)) + else + echo >&2 "${me}: Failed to send pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP error code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 +} + + +# ----------------------------------------------------------------------------- +# telegram sender + +send_telegram() { + local bottoken="${1}" chatids="${2}" message="${3}" httpcode sent=0 chatid disableNotification="" + + if [ "${status}" = "CLEAR" ]; then disableNotification="--data-urlencode disable_notification=true"; fi + + if [ "${SEND_TELEGRAM}" = "YES" -a ! -z "${bottoken}" -a ! -z "${chatids}" -a ! -z "${message}" ]; + then + for chatid in ${chatids} + do + # https://core.telegram.org/bots/api#sendmessage + httpcode=$(${curl} --write-out %{http_code} --silent --output /dev/null ${disableNotification} \ + --data-urlencode "parse_mode=HTML" \ + --data-urlencode "disable_web_page_preview=true" \ + --data-urlencode "text=$message" \ + "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=$chatid") + + if [ "${httpcode}" == "200" ] + then + echo >&2 "${me}: Sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'" + sent=$((sent + 1)) + elif [ "${httpcode}" == "401" ] + then + echo >&2 "${me}: Failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token." + else + echo >&2 "${me}: Failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP error code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 +} + +# ----------------------------------------------------------------------------- +# slack sender + +send_slack() { + local webhook="${1}" channels="${2}" httpcode sent=0 channel color payload + + [ "${SEND_SLACK}" != "YES" ] && return 1 + + case "${status}" in + WARNING) color="warning" ;; + CRITICAL) color="danger" ;; + CLEAR) color="good" ;; + *) color="#777777" ;; + esac + + for channel in ${channels} + do + payload="$(cat <", + "ts": ${when} + } + ] + } +EOF + )" + + httpcode=$(${curl} --write-out %{http_code} --silent --output /dev/null -X POST --data-urlencode "payload=${payload}" "${webhook}") + if [ "${httpcode}" == "200" ] + then + echo >&2 "${me}: Sent slack notification for: ${host} ${chart}.${name} is ${status} to '${channel}'" + sent=$((sent + 1)) + else + echo >&2 "${me}: Failed to send slack notification for: ${host} ${chart}.${name} is ${status} to '${channel}', with HTTP error code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + + return 1 +} + + +# ----------------------------------------------------------------------------- +# prepare the content of the notification + +# the url to send the user on click +urlencode "${NETDATA_REGISTRY_HOSTNAME}" >/dev/null; url_host="${REPLY}" +urlencode "${chart}" >/dev/null; url_chart="${REPLY}" +urlencode "${family}" >/dev/null; url_family="${REPLY}" +urlencode "${name}" >/dev/null; url_name="${REPLY}" +goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}" + +# the severity of the alarm +severity="${status}" + +# the time the alarm was raised +duration4human ${duration} >/dev/null; duration_txt="${REPLY}" +duration4human ${non_clear_duration} >/dev/null; non_clear_duration_txt="${REPLY}" +raised_for="(was ${old_status,,} for ${duration_txt})" + +# the key status message +status_message="status unknown" + +# the color of the alarm +color="grey" + +# the alarm value +alarm="${name//_/ } = ${value} ${units}" + +# the image of the alarm +image="${images_base_url}/images/seo-performance-128.png" + +# prepare the title based on status +case "${status}" in + CRITICAL) + image="${images_base_url}/images/alert-128-red.png" + status_message="is critical" + color="#ca414b" + ;; + + WARNING) + image="${images_base_url}/images/alert-128-orange.png" + status_message="needs attention" + color="#caca4b" + ;; + + CLEAR) + image="${images_base_url}/images/check-mark-2-128-green.png" + status_message="recovered" + color="#77ca6d" + + # don't show the value when the status is CLEAR + # for certain alarms, this value might not have any meaning + alarm="${name//_/ } ${raised_for}" + ;; +esac + +if [ "${status}" = "CLEAR" ] +then + severity="Recovered from ${old_status}" + if [ $non_clear_duration -gt $duration ] + then + raised_for="(alarm was raised for ${non_clear_duration_txt})" + fi + +elif [ "${old_status}" = "WARNING" -a "${status}" = "CRITICAL" ] +then + severity="Escalated to ${status}" + if [ $non_clear_duration -gt $duration ] + then + raised_for="(alarm is raised for ${non_clear_duration_txt})" + fi + +elif [ "${old_status}" = "CRITICAL" -a "${status}" = "WARNING" ] +then + severity="Demoted to ${status}" + if [ $non_clear_duration -gt $duration ] + then + raised_for="(alarm is raised for ${non_clear_duration_txt})" + fi + +else + raised_for= +fi + +# prepare HTML versions of elements +info_html= +[ ! -z "${info}" ] && info_html="
${info}
" + +raised_for_html= +[ ! -z "${raised_for}" ] && raised_for_html="
${raised_for}" + +# ----------------------------------------------------------------------------- +# send the slack notification + +# slack aggregates posts from the same username +# so we use "${host} ${status}" as the bot username, to make them diff + +send_slack "${SLACK_WEBHOOK_URL}" "${to_slack}" +SENT_SLACK=$? + +# ----------------------------------------------------------------------------- +# send the pushover notification + +send_pushover "${PUSHOVER_APP_TOKEN}" "${to_pushover}" "${when}" "${goto_url}" "${status}" "${host} ${status_message} - ${name//_/ } - ${chart}" " +${alarm}${info_html}
  +${chart}
Chart
 
+${family}
Family
 
+${severity}
Severity
 
+${date}${raised_for_html}
Time
 
+View Netdata
  +The source of this alarm is line ${src} +" + +SENT_PUSHOVER=$? + +# ----------------------------------------------------------------------------- +# send the telegram.org message + +# https://core.telegram.org/bots/api#formatting-options +telegram_message="${severity}" +[ "${status_message}" != "recovered" ] && telegram_message="${telegram_message}, ${status_message}" +telegram_message="${telegram_message} +${chart} (${family}) +${alarm} +${info}" + +send_telegram "${TELEGRAM_BOT_TOKEN}" "${to_telegram}" "${telegram_message}" + +SENT_TELEGRAM=$? + +# ----------------------------------------------------------------------------- +# send the email + +send_email < + + + + + + + + + +
+
+ + + + + + + + + + + + +
+
netdata notification
+
+

${host} ${status_message}

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ ${chart} + Chart +
+ ${alarm}${info_html} + Alarm +
+ ${family} + Family +
+ ${severity} + Severity +
${date} + ${raised_for_html} Time +
+ View Netdata +
The source of this alarm is line ${src} +
Sent by + netdata, the real-time performance monitoring. +
+
+
+
+
+ + +EOF + +SENT_EMAIL=$? + +# ----------------------------------------------------------------------------- +# let netdata know + +# we did send something +[ ${SENT_EMAIL} -eq 0 -o ${SENT_PUSHOVER} -eq 0 -o ${SENT_TELEGRAM} -eq 0 -o ${SENT_SLACK} -eq 0 ] && exit 0 + +# we did not send anything +exit 1 diff --git a/plugins.d/cgroup-name.sh b/plugins.d/cgroup-name.sh index 8bfc984c2..1c6f564b4 100755 --- a/plugins.d/cgroup-name.sh +++ b/plugins.d/cgroup-name.sh @@ -71,7 +71,7 @@ if [ -z "${NAME}" ] fi [ -z "${NAME}" ] && NAME="${CGROUP}" - [ ${#NAME} -gt 50 ] && NAME="${NAME:0:50}" + [ ${#NAME} -gt 100 ] && NAME="${NAME:0:100}" fi echo >&2 "${0}: cgroup '${CGROUP}' is called '${NAME}'" diff --git a/plugins.d/charts.d.plugin b/plugins.d/charts.d.plugin index 9aaadc168..df9998ece 100755 --- a/plugins.d/charts.d.plugin +++ b/plugins.d/charts.d.plugin @@ -48,10 +48,6 @@ require_cmd awk || exit 1 # insternal defaults # netdata exposes a few environment variables for us -pause_method="sleep" # use either "suspend" or "sleep" - # DO NOT USE SUSPEND - LINUX WILL SUSPEND NETDATA TOO - # THE WHOLE PROCESS GROUP - NOT JUST THE SHELL - pluginsd="${NETDATA_PLUGINS_DIR}" [ -z "$pluginsd" ] && pluginsd="$( dirname $PROGRAM_FILE )" @@ -153,6 +149,24 @@ do done +# ----------------------------------------------------------------------------- +# loop control + +# default sleep function +LOOPSLEEPMS_HIGHRES=0 +now_ms= +current_time_ms_default() { + now_ms="$(date +'%s')000" +} +current_time_ms="current_time_ms_default" +current_time_ms_accuracy=1 +mysleep="sleep" + +# if found and included, this file overwrites loopsleepms() +# and current_time_ms() with a high resolution timer function +# for precise looping. +. "$pluginsd/loopsleepms.sh.inc" + # ----------------------------------------------------------------------------- # load my configuration @@ -172,13 +186,6 @@ else echo >&2 "$PROGRAM_NAME: configuration file '$myconfig' not found. Using defaults." fi -if [ "$pause_method" = "suspend" ] -then - # enable bash job control - # this is required for suspend to work - set -m -fi - # we check for the timeout command, after we load our # configuration, so that the user may overwrite the # timeout command we use, providing a function that @@ -204,27 +211,6 @@ if [ ! -d "$chartsd" ] fi -# ----------------------------------------------------------------------------- -# loop control - -# default sleep function -LOOPSLEEPMS_HIGHRES=0 -loopsleepms() { - [ "$1" = "tellwork" ] && shift - sleep $1 -} - -now_ms= -current_time_ms() { - now_ms="$(date +'%s')000" -} - -# if found and included, this file overwrites loopsleepms() -# and current_time_ms() with a high resolution timer function -# for precise looping. -. "$pluginsd/loopsleepms.sh.inc" - - # ----------------------------------------------------------------------------- # library functions @@ -537,7 +523,7 @@ global_update() { next_ms x seconds millis # return the current time in ms in $now_ms - current_time_ms + ${current_time_ms} exit_at=$(( now_ms + (restart_timeout * 1000) )) @@ -562,16 +548,16 @@ global_update() { next_charts=() # return the current time in ms in $now_ms - current_time_ms + ${current_time_ms} for chart in "${now_charts[@]}" do - # echo >&2 "DEBUG: chart: $chart last: ${charts_last_update[$chart]}, next: ${charts_next_update[$chart]}, now: ${now_ms}" + #echo >&2 " DEBUG: chart: $chart last: ${charts_last_update[$chart]}, next: ${charts_next_update[$chart]}, now: ${now_ms}" if [ ${now_ms} -ge ${charts_next_update[$chart]} ] then last_ms=${charts_last_update[$chart]} dt=$(( (now_ms - last_ms) )) - # echo >&2 "DEBUG: chart: $chart last: ${charts_last_update[$chart]}, next: ${charts_next_update[$chart]}, now: ${now_ms}, dt: ${dt}" + #echo >&2 " DEBUG: chart: $chart last: ${charts_last_update[$chart]}, next: ${charts_next_update[$chart]}, now: ${now_ms}, dt: ${dt}" charts_last_update[$chart]=${now_ms} @@ -590,11 +576,12 @@ global_update() { fi exec_start_ms=$now_ms + #echo >&2 " EXEC: $chart$charts_update $dt" $chart$charts_update $dt ret=$? # return the current time in ms in $now_ms - current_time_ms; exec_end_ms=$now_ms + ${current_time_ms}; exec_end_ms=$now_ms echo "BEGIN netdata.plugin_chartsd_$chart $dt" echo "SET run_time = $(( exec_end_ms - exec_start_ms ))" @@ -620,27 +607,29 @@ global_update() { fi done - if [ "$pause_method" = "suspend" ] - then - echo "STOPPING_WAKE_ME_UP_PLEASE" - suspend || ( echo >&2 "$PROGRAM_NAME: suspend returned error $?, falling back to sleep."; loopsleepms $debug_time $update_every $time_divisor) - else - # wait the time you are required to - next_ms=$((now_ms + (update_every * 1000 * 100) )) - for x in "${charts_next_update[@]}"; do [ ${x} -lt ${next_ms} ] && next_ms=${x}; done - next_ms=$((next_ms - now_ms)) + # wait the time you are required to + next_ms=$((now_ms + (update_every * 1000 * 100) )) + for x in "${charts_next_update[@]}"; do [ ${x} -lt ${next_ms} ] && next_ms=${x}; done + next_ms=$((next_ms - now_ms)) - if [ ${LOOPSLEEPMS_HIGHRES} -eq 1 -a ${next_ms} -gt 0 ] + if [ ${LOOPSLEEPMS_HIGHRES} -eq 1 -a ${next_ms} -gt 0 ] + then + next_ms=$(( next_ms + current_time_ms_accuracy )) + seconds=$(( next_ms / 1000 )) + millis=$(( next_ms % 1000 )) + if [ ${millis} -lt 10 ] then - seconds=$(( next_ms / 1000 )) - millis=$(( next_ms % 1000 )) - [ ${millis} -lt 10 ] && millis="0${millis}" - [ ${millis} -lt 100 ] && millis="0${millis}" - [ $debug -eq 1 ] && echo >&2 "$PROGRAM_NAME: sleeping for ${seconds}.${millis} seconds." - sleep ${seconds}.${millis} - else - sleep $update_every + millis="00${millis}" + elif [ ${millis} -lt 100 ] + then + millis="0${millis}" fi + + [ $debug -eq 1 ] && echo >&2 "$PROGRAM_NAME: sleeping for ${seconds}.${millis} seconds." + ${mysleep} ${seconds}.${millis} + else + [ $debug -eq 1 ] && echo >&2 "$PROGRAM_NAME: sleeping for ${update_every} seconds." + ${mysleep} $update_every fi test ${now_ms} -ge ${exit_at} && exit 0 diff --git a/plugins.d/loopsleepms.sh.inc b/plugins.d/loopsleepms.sh.inc index 02ab694d2..6de93043c 100644 --- a/plugins.d/loopsleepms.sh.inc +++ b/plugins.d/loopsleepms.sh.inc @@ -1,10 +1,5 @@ # no need for shebang - this file is included from other scripts -# this function is used to sleep a fraction of a second -# it calculates the difference between every time is called -# and tries to align the sleep time to give you exactly the -# loop you need. - LOOPSLEEP_DATE="$(which date)" if [ -z "$LOOPSLEEP_DATE" ] then @@ -12,17 +7,14 @@ if [ -z "$LOOPSLEEP_DATE" ] exit 1 fi -LOOPSLEEPMS_LASTRUN=0 -LOOPSLEEPMS_LASTSLEEP=0 -LOOPSLEEPMS_LASTWORK=0 +# ----------------------------------------------------------------------------- +# use the date command as a high resolution timer +now_ms= LOOPSLEEPMS_HIGHRES=1 test "$($LOOPSLEEP_DATE +%N)" = "%N" && LOOPSLEEPMS_HIGHRES=0 - -now_ms= -current_time_ms() { - # if high resolution is not supported - # just sleep the time requested, in seconds +test -z "$($LOOPSLEEP_DATE +%N)" && LOOPSLEEPMS_HIGHRES=0 +current_time_ms_from_date() { if [ $LOOPSLEEPMS_HIGHRES -eq 0 ] then now_ms="$($LOOPSLEEP_DATE +'%s')000" @@ -31,60 +23,167 @@ current_time_ms() { fi } +# ----------------------------------------------------------------------------- +# use /proc/uptime as a high resolution timer + +current_time_ms_from_date +current_time_ms_from_uptime_started="${now_ms}" +current_time_ms_from_uptime_last="${now_ms}" +current_time_ms_from_uptime_first=0 +current_time_ms_from_uptime() { + local up rest arr=() n + + read up rest &2 "$0: Cannot read /proc/uptime - falling back to current_time_ms_from_date()." + current_time_ms="current_time_ms_from_date" + current_time_ms_from_date + current_time_ms_accuracy=1 + return + fi + + arr=(${up//./ }) + + if [ ${#arr[1]} -lt 1 ] + then + n="${arr[0]}000" + elif [ ${#arr[1]} -lt 2 ] + then + n="${arr[0]}${arr[1]}00" + elif [ ${#arr[1]} -lt 3 ] + then + n="${arr[0]}${arr[1]}0" + else + n="${arr[0]}${arr[1]}" + fi + + now_ms=$((current_time_ms_from_uptime_started - current_time_ms_from_uptime_first + n)) + + if [ "${now_ms}" -lt "${current_time_ms_from_uptime_last}" ] + then + echo >&2 "$0: Cannot use current_time_ms_from_uptime() - new time ${now_ms} is older than the last ${current_time_ms_from_uptime_last} - falling back to current_time_ms_from_date()." + current_time_ms="current_time_ms_from_date" + current_time_ms_from_date + current_time_ms_accuracy=1 + fi + + current_time_ms_from_uptime_last="${now_ms}" +} +current_time_ms_from_uptime +current_time_ms_from_uptime_first="$((now_ms - current_time_ms_from_uptime_started))" +current_time_ms_from_uptime_last="${current_time_ms_from_uptime_first}" +current_time_ms="current_time_ms_from_uptime" +current_time_ms_accuracy=10 +if [ "${current_time_ms_from_uptime_first}" -eq 0 ] + then + echo >&2 "$0: Invalid setup for current_time_ms_from_uptime() - falling back to current_time_ms_from_date()." + current_time_ms="current_time_ms_from_date" + current_time_ms_accuracy=1 +fi + +# ----------------------------------------------------------------------------- +# use read with timeout for sleep + +mysleep="mysleep_read" + +mysleep_fifo="${NETDATA_CACHE_DIR-/tmp}/.netdata_bash_sleep_timer_fifo" +[ ! -e "${mysleep_fifo}" ] && mkfifo "${mysleep_fifo}" +[ ! -e "${mysleep_fifo}" ] && mysleep="sleep" + +mysleep_read() { + read -t "${1}" <>"${mysleep_fifo}" + ret=$? + if [ $ret -le 128 ] + then + echo >&2 "$0: Cannot use read for sleeping (return code ${ret})." + mysleep="sleep" + ${mysleep} "${1}" + fi +} + + +# ----------------------------------------------------------------------------- +# this function is used to sleep a fraction of a second +# it calculates the difference between every time is called +# and tries to align the sleep time to give you exactly the +# loop you need. + +LOOPSLEEPMS_LASTRUN=0 +LOOPSLEEPMS_NEXTRUN=0 +LOOPSLEEPMS_LASTSLEEP=0 +LOOPSLEEPMS_LASTWORK=0 + loopsleepms() { - local tellwork=0 t="$1" div s m now mstosleep + local tellwork=0 t="${1}" div s m now mstosleep - if [ "$t" = "tellwork" ] + if [ "${t}" = "tellwork" ] then tellwork=1 shift - t="$1" + t="${1}" fi - div="${2-100}" # $t = the time in seconds to wait # if high resolution is not supported # just sleep the time requested, in seconds - if [ $LOOPSLEEPMS_HIGHRES -eq 0 ] + if [ ${LOOPSLEEPMS_HIGHRES} -eq 0 ] then - sleep $t + sleep ${t} return fi - # get the current time, in ms - # milliseconds since epoch (1-1-1970) - now="$(( $( $LOOPSLEEP_DATE +'%s * 1000 + %-N / 1000000' ) ))" - - # calculate required sleep in ms - t=$((t * 1000 * div / 100)) - - # this is our first run - # just wait the requested time - test $LOOPSLEEPMS_LASTRUN -eq 0 && LOOPSLEEPMS_LASTRUN=$now + # get the current time, in ms in ${now_ms} + ${current_time_ms} # calculate ms since last run - LOOPSLEEPMS_LASTWORK=$((now - LOOPSLEEPMS_LASTRUN - LOOPSLEEPMS_LASTSLEEP)) + [ ${LOOPSLEEPMS_LASTRUN} -gt 0 ] && \ + LOOPSLEEPMS_LASTWORK=$((now_ms - LOOPSLEEPMS_LASTRUN - LOOPSLEEPMS_LASTSLEEP)) # echo "# last loop's work took $LOOPSLEEPMS_LASTWORK ms" + + # remember this run + LOOPSLEEPMS_LASTRUN=${now_ms} + + # calculate the next run + LOOPSLEEPMS_NEXTRUN=$(( ( now_ms - ( now_ms % ( t * 1000 ) ) ) + ( t * 1000 ) )) # calculate ms to sleep - mstosleep=$(( t - LOOPSLEEPMS_LASTWORK )) + mstosleep=$(( LOOPSLEEPMS_NEXTRUN - now_ms + current_time_ms_accuracy )) # echo "# mstosleep is $mstosleep ms" # if we are too slow, sleep some time - test $mstosleep -lt 200 && mstosleep=200 + test ${mstosleep} -lt 200 && mstosleep=200 - s=$((mstosleep / 1000)) - m=$((mstosleep - (s * 1000))) + s=$(( mstosleep / 1000 )) + m=$(( mstosleep - (s * 1000) )) + [ "${m}" -lt 100 ] && m="0${m}" + [ "${m}" -lt 10 ] && m="0${m}" - test $tellwork -eq 1 && echo >&2 " >>> PERFORMANCE >>> WORK TOOK $LOOPSLEEPMS_LASTWORK ms ( $((LOOPSLEEPMS_LASTWORK * 100 / 1000)).$((LOOPSLEEPMS_LASTWORK % 10))% cpu ) >>> SLEEPING $mstosleep ms" + test $tellwork -eq 1 && echo >&2 " >>> PERFORMANCE >>> WORK TOOK ${LOOPSLEEPMS_LASTWORK} ms ( $((LOOPSLEEPMS_LASTWORK * 100 / 1000)).$((LOOPSLEEPMS_LASTWORK % 10))% cpu ) >>> SLEEPING ${mstosleep} ms" - # echo "# sleeping $s.$m" + # echo "# sleeping ${s}.${m}" # echo - sleep $s.$m + ${mysleep} ${s}.${m} # keep the values we need # for our next run - LOOPSLEEPMS_LASTRUN=$now LOOPSLEEPMS_LASTSLEEP=$mstosleep } + +# test it +#while [ 1 ] +#do +# r=$(( (RANDOM * 2000 / 32767) )) +# s=$((r / 1000)) +# m=$((r - (s * 1000))) +# [ "${m}" -lt 100 ] && m="0${m}" +# [ "${m}" -lt 10 ] && m="0${m}" +# echo "${r} = ${s}.${m}" +# +# # the work +# ${mysleep} ${s}.${m} +# +# # the alignment loop +# loopsleepms tellwork 1 +#done diff --git a/plugins.d/tc-qos-helper.sh b/plugins.d/tc-qos-helper.sh index bff5217d2..9caef85f7 100755 --- a/plugins.d/tc-qos-helper.sh +++ b/plugins.d/tc-qos-helper.sh @@ -27,16 +27,9 @@ if [ -f "${config_dir}/tc-qos-helper.conf" ] source "${config_dir}/tc-qos-helper.conf" fi -# default time function -now_ms= -current_time_ms() { - now_ms="$(date +'%s')000" -} - # default sleep function LOOPSLEEPMS_LASTWORK=0 loopsleepms() { - [ "$1" = "tellwork" ] && shift sleep $1 } -- cgit v1.2.3