diff options
Diffstat (limited to 'plugins.d')
-rwxr-xr-x | plugins.d/alarm-notify.sh | 172 | ||||
-rwxr-xr-x | plugins.d/cgroup-name.sh | 52 | ||||
-rwxr-xr-x | plugins.d/cgroup-network-helper.sh | 13 | ||||
-rwxr-xr-x | plugins.d/charts.d.plugin | 11 | ||||
-rwxr-xr-x | plugins.d/python.d.plugin | 6 |
5 files changed, 223 insertions, 31 deletions
diff --git a/plugins.d/alarm-notify.sh b/plugins.d/alarm-notify.sh index 0af98095d..3e23a164f 100755 --- a/plugins.d/alarm-notify.sh +++ b/plugins.d/alarm-notify.sh @@ -16,6 +16,7 @@ # Supported notification methods: # - emails by @ktsaou # - slack.com notifications by @ktsaou +# - alerta.io notifications by @kattunga # - discordapp.com notifications by @lowfive # - pushover.net notifications by @ktsaou # - pushbullet.com push notifications by Tiago Peralta @tperalta82 #1070 @@ -119,7 +120,7 @@ docurl() { echo >&2 "--- END curl command ---" local out=$(mktemp /tmp/netdata-health-alarm-notify-XXXXXXXX) - local code=$(${curl} --write-out %{http_code} --output "${out}" --silent --show-error "${@}") + local code=$(${curl} ${curl_options} --write-out %{http_code} --output "${out}" --silent --show-error "${@}") local ret=$? echo >&2 "--- BEGIN received response ---" cat >&2 "${out}" @@ -131,7 +132,7 @@ docurl() { return ${ret} fi - ${curl} --write-out %{http_code} --output /dev/null --silent --show-error "${@}" + ${curl} ${curl_options} --write-out %{http_code} --output /dev/null --silent --show-error "${@}" return $? } @@ -212,6 +213,9 @@ fi # This can be overwritten at the configuration file. images_base_url="https://registry.my-netdata.io" +# curl options to use +curl_options= + # needed commands # if empty they will be searched in the system path curl= @@ -219,6 +223,7 @@ sendmail= # enable / disable features SEND_SLACK="YES" +SEND_ALERTA="YES" SEND_FLOCK="YES" SEND_DISCORD="YES" SEND_PUSHOVER="YES" @@ -231,6 +236,7 @@ SEND_EMAIL="YES" SEND_PUSHBULLET="YES" SEND_KAFKA="YES" SEND_PD="YES" +SEND_IRC="YES" SEND_CUSTOM="YES" # slack configs @@ -238,6 +244,12 @@ SLACK_WEBHOOK_URL= DEFAULT_RECIPIENT_SLACK= declare -A role_recipients_slack=() +# alerta configs +ALERTA_WEBHOOK_URL= +ALERTA_API_KEY= +DEFAULT_RECIPIENT_ALERTA= +declare -A role_recipients_alerta=() + # flock configs FLOCK_WEBHOOK_URL= DEFAULT_RECIPIENT_FLOCK= @@ -308,6 +320,13 @@ DEFAULT_RECIPIENT_EMAIL="root" EMAIL_CHARSET=$(locale charmap 2>/dev/null) declare -A role_recipients_email=() +# irc configs +IRC_NICKNAME= +IRC_REALNAME= +DEFAULT_RECIPIENT_IRC= +IRC_NETWORK= +declare -A role_recipients_irc=() + # load the user configuration # this will overwrite the variables above if [ -f "${NETDATA_CONFIG_DIR}/health_alarm_notify.conf" ] @@ -386,6 +405,7 @@ filter_recipient_by_criticality() { # find the recipients' addresses per method declare -A arr_slack=() +declare -A arr_alerta=() declare -A arr_flock=() declare -A arr_discord=() declare -A arr_pushover=() @@ -398,6 +418,7 @@ declare -A arr_email=() declare -A arr_custom=() declare -A arr_messagebird=() declare -A arr_kavenegar=() +declare -A arr_irc=() # netdata may call us with multiple roles, and roles may have multiple but # overlapping recipients - so, here we find the unique recipients. @@ -479,6 +500,14 @@ do [ "${r}" != "disabled" ] && filter_recipient_by_criticality slack "${r}" && arr_slack[${r/|*/}]="1" done + # alerta + a="${role_recipients_alerta[${x}]}" + [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_ALERTA}" + for r in ${a//,/ } + do + [ "${r}" != "disabled" ] && filter_recipient_by_criticality alerta "${r}" && arr_alerta[${r/|*/}]="1" + done + # flock a="${role_recipients_flock[${x}]}" [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_FLOCK}" @@ -502,6 +531,14 @@ do do [ "${r}" != "disabled" ] && filter_recipient_by_criticality pd "${r}" && arr_pd[${r/|*/}]="1" done + + # irc + a="${role_recipients_irc[${x}]}" + [ -z "${a}" ] && a="${DEFAULT_RECIPIENT_IRC}" + for r in ${a//,/ } + do + [ "${r}" != "disabled" ] && filter_recipient_by_criticality irc "${r}" && arr_irc[${r/|*/}]="1" + done # custom a="${role_recipients_custom[${x}]}" @@ -517,6 +554,10 @@ done to_slack="${!arr_slack[*]}" [ -z "${to_slack}" ] && SEND_SLACK="NO" +# build the list of alerta recipients (channels) +to_alerta="${!arr_alerta[*]}" +[ -z "${to_alerta}" ] && SEND_ALERTA="NO" + # build the list of flock recipients (channels) to_flock="${!arr_flock[*]}" [ -z "${to_flock}" ] && SEND_FLOCK="NO" @@ -570,6 +611,9 @@ do done [ -z "${to_email}" ] && SEND_EMAIL="NO" +# build the list of irc recipients (channels) +to_irc="${!arr_irc[*]}" +[ -z "${to_irc}" ] && SEND_IRC="NO" # ----------------------------------------------------------------------------- # verify the delivery methods supported @@ -577,6 +621,9 @@ done # check slack [ -z "${SLACK_WEBHOOK_URL}" ] && SEND_SLACK="NO" +# check alerta +[ -z "${ALERTA_WEBHOOK_URL}" ] && SEND_ALERTA="NO" + # check flock [ -z "${FLOCK_WEBHOOK_URL}" ] && SEND_FLOCK="NO" @@ -607,6 +654,9 @@ done # check kafka [ -z "${KAFKA_URL}" -o -z "${KAFKA_SENDER_IP}" ] && SEND_KAFKA="NO" +# check irc +[ -z "${IRC_NETWORK}" ] && SEND_IRC="NO" + # check pagerduty.com # if we need pd-send, check for the pd-send command # https://www.pagerduty.com/docs/guides/agent-install-guide/ @@ -624,6 +674,7 @@ fi if [ \( \ "${SEND_PUSHOVER}" = "YES" \ -o "${SEND_SLACK}" = "YES" \ + -o "${SEND_ALERTA}" = "YES" \ -o "${SEND_FLOCK}" = "YES" \ -o "${SEND_DISCORD}" = "YES" \ -o "${SEND_HIPCHAT}" = "YES" \ @@ -644,6 +695,7 @@ if [ \( \ SEND_PUSHBULLET="NO" SEND_TELEGRAM="NO" SEND_SLACK="NO" + SEND_ALERTA="NO" SEND_FLOCK="NO" SEND_DISCORD="NO" SEND_TWILIO="NO" @@ -671,6 +723,7 @@ if [ "${SEND_EMAIL}" != "YES" \ -a "${SEND_PUSHOVER}" != "YES" \ -a "${SEND_TELEGRAM}" != "YES" \ -a "${SEND_SLACK}" != "YES" \ + -a "${SEND_ALERTA}" != "YES" \ -a "${SEND_FLOCK}" != "YES" \ -a "${SEND_DISCORD}" != "YES" \ -a "${SEND_TWILIO}" != "YES" \ @@ -681,6 +734,7 @@ if [ "${SEND_EMAIL}" != "YES" \ -a "${SEND_KAFKA}" != "YES" \ -a "${SEND_PD}" != "YES" \ -a "${SEND_CUSTOM}" != "YES" \ + -a "${SEND_IRC}" != "YES" \ ] then fatal "All notification methods are disabled. Not sending notification for host '${host}', chart '${chart}' to '${roles}' for '${name}' = '${value}' for status '${status}'." @@ -954,7 +1008,7 @@ send_pd() { ${pd_send} -k ${PD_SERVICE_KEY} \ -t ${t} \ -d "${d}" \ - -i ${alarm_id} \ + -i ${host}:${chart}:${name} \ -f 'info'="${info}" \ -f 'value_w_units'="${value_string}" \ -f 'when'="${when}" \ @@ -1029,6 +1083,10 @@ send_twilio() { send_hipchat() { local authtoken="${1}" recipients="${2}" message="${3}" httpcode sent=0 room color sender msg_format notify + # remove <small></small> from the message + message="${message//<small>/}" + message="${message//<\/small>/}" + if [ "${SEND_HIPCHAT}" = "YES" -a ! -z "${HIPCHAT_SERVER}" -a ! -z "${authtoken}" -a ! -z "${recipients}" -a ! -z "${message}" ] then # A label to be shown in addition to the sender's name @@ -1248,6 +1306,53 @@ EOF } # ----------------------------------------------------------------------------- +# alerta sender + +send_alerta() { + local webhook="${1}" channels="${2}" httpcode sent=0 channel severity content + + [ "${SEND_ALERTA}" != "YES" ] && return 1 + + case "${status}" in + WARNING) severity="warning" ;; + CRITICAL) severity="critical" ;; + CLEAR) severity="cleared" ;; + *) severity="unknown" ;; + esac + + info=$( echo -n ${info}) + + # the "event" property must be unique and repetible between states to let alerta do automatic correlation using severity value + for channel in ${channels} + do + content="{" + content="$content \"environment\": \"${channel}\"," + content="$content \"service\": [\"${host}\"]," + content="$content \"resource\": \"${host}\"," + content="$content \"event\": \"${name}.${chart} (${family})\"," + content="$content \"severity\": \"${severity}\"," + content="$content \"value\": \"${alarm}\"," + content="$content \"text\": \"${info}\"" + content="$content }" + + + httpcode=$(docurl -X POST "${webhook}/alert" -H "Content-Type: application/json" -H "Authorization: Key $ALERTA_API_KEY" -d "$content" ) + + if [[ "${httpcode}" = "200" || "${httpcode}" = "201" ]] + then + info "sent alerta notification for: ${host} ${chart}.${name} is ${status} to '${channel}'" + sent=$((sent + 1)) + else + error "failed to send alerta notification for: ${host} ${chart}.${name} is ${status} to '${channel}', with HTTP error code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + + return 1 +} + +# ----------------------------------------------------------------------------- # flock sender send_flock() { @@ -1365,6 +1470,46 @@ EOF return 1 } +# ----------------------------------------------------------------------------- +# irc sender + +send_irc() { + local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" SERVERNAME="${5}" MESSAGE="${6}" sent=0 channel color send_alarm reply_codes error + + if [ "${SEND_IRC}" = "YES" -a ! -z "${NICKNAME}" -a ! -z "${REALNAME}" -a ! -z "${CHANNELS}" -a ! -z "${NETWORK}" -a ! -z "${SERVERNAME}" ] + then + case "${status}" in + WARNING) color="warning" ;; + CRITICAL) color="danger" ;; + CLEAR) color="good" ;; + *) color="#777777" ;; + esac + + for CHANNEL in ${CHANNELS} + do + error=0 + send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\nNICK ${NICKNAME}\nJOIN ${CHANNEL}\nPRIVMSG ${CHANNEL} :${MESSAGE}\nQUIT\n" \ | nc ${NETWORK} 6667) + reply_codes=$(echo ${send_alarm} | cut -d ' ' -f 2 | grep -o '[0-9]*') + for code in ${reply_codes} + do + [ "${code}" -ge 400 -a "${code}" -le 599 ] && error=1 && break + done + + if [ "${error}" -eq 0 ] + then + info "sent irc notification for: ${host} ${chart}.${name} is ${status} to '${CHANNEL}'" + sent=$((sent + 1)) + else + error "failed to send irc notification for: ${host} ${chart}.${name} is ${status} to '${CHANNEL}', with error code ${code}." + fi + done + fi + + [ ${sent} -gt 0 ] && return 0 + + return 1 +} + # ----------------------------------------------------------------------------- # prepare the content of the notification @@ -1466,6 +1611,15 @@ send_slack "${SLACK_WEBHOOK_URL}" "${to_slack}" SENT_SLACK=$? # ----------------------------------------------------------------------------- +# send the alerta notification + +# alerta aggregates posts from the same username +# so we use "${host} ${status}" as the bot username, to make them diff + +send_alerta "${ALERTA_WEBHOOK_URL}" "${to_alerta}" +SENT_ALERTA=$? + +# ----------------------------------------------------------------------------- # send the flock notification # flock aggregates posts from the same username @@ -1570,6 +1724,16 @@ SENT_KAFKA=$? send_pd "${to_pd}" SENT_PD=$? +# ----------------------------------------------------------------------------- +# send the irc message + +send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm} +Severity: ${severity} +Chart: ${chart} +Family: ${family} +${info}" + +SENT_IRC=$? # ----------------------------------------------------------------------------- # send the custom message @@ -1733,6 +1897,7 @@ if [ ${SENT_EMAIL} -eq 0 \ -o ${SENT_PUSHOVER} -eq 0 \ -o ${SENT_TELEGRAM} -eq 0 \ -o ${SENT_SLACK} -eq 0 \ + -o ${SENT_ALERTA} -eq 0 \ -o ${SENT_FLOCK} -eq 0 \ -o ${SENT_DISCORD} -eq 0 \ -o ${SENT_TWILIO} -eq 0 \ @@ -1742,6 +1907,7 @@ if [ ${SENT_EMAIL} -eq 0 \ -o ${SENT_PUSHBULLET} -eq 0 \ -o ${SENT_KAFKA} -eq 0 \ -o ${SENT_PD} -eq 0 \ + -o ${SENT_IRC} -eq 0 \ -o ${SENT_CUSTOM} -eq 0 \ ] then diff --git a/plugins.d/cgroup-name.sh b/plugins.d/cgroup-name.sh index acdd6f4f9..3c8ad7205 100755 --- a/plugins.d/cgroup-name.sh +++ b/plugins.d/cgroup-name.sh @@ -94,6 +94,23 @@ function docker_get_name_api { return 0 } +function docker_get_name { + local id="${1}" + if hash docker 2>/dev/null + then + docker_get_name_classic "${id}" + else + docker_get_name_api "${id}" || docker_get_name_classic "${id}" + fi + if [ -z "${NAME}" ] + then + warning "cannot find the name of docker container '${id}'" + NAME="${id:0:12}" + else + info "docker container '${id}' is named '${NAME}'" + fi +} + if [ -z "${NAME}" ] then if [[ "${CGROUP}" =~ ^.*docker[-_/\.][a-fA-F0-9]+[-_\.]?.*$ ]] @@ -105,20 +122,29 @@ if [ -z "${NAME}" ] if [ ! -z "${DOCKERID}" -a \( ${#DOCKERID} -eq 64 -o ${#DOCKERID} -eq 12 \) ] then - if hash docker 2>/dev/null - then - docker_get_name_classic ${DOCKERID} - else - docker_get_name_api ${DOCKERID} || docker_get_name_classic ${DOCKERID} - fi - if [ -z "${NAME}" ] - then - warning "cannot find the name of docker container '${DOCKERID}'" - NAME="${DOCKERID:0:12}" - else - info "docker container '${DOCKERID}' is named '${NAME}'" - fi + docker_get_name "${DOCKERID}" + else + error "a docker id cannot be extracted from docker cgroup '${CGROUP}'." + fi + elif [[ "${CGROUP}" =~ ^.*kubepods[_/].*[_/]pod[a-fA-F0-9-]+[_/][a-fA-F0-9]+$ ]] + then + # kubernetes + + DOCKERID="$( echo "${CGROUP}" | sed "s|^.*kubepods[_/].*[_/]pod[a-fA-F0-9-]\+[_/]\([a-fA-F0-9]\+\)$|\1|" )" + # echo "DOCKERID=${DOCKERID}" + + if [ ! -z "${DOCKERID}" -a \( ${#DOCKERID} -eq 64 -o ${#DOCKERID} -eq 12 \) ] + then + docker_get_name "${DOCKERID}" + else + error "a docker id cannot be extracted from kubernetes cgroup '${CGROUP}'." fi + elif [[ "${CGROUP}" =~ machine.slice[_/].*\.service ]] + then + # systemd-nspawn + + NAME="$(echo ${CGROUP} | sed 's/.*machine.slice[_\/]\(.*\)\.service/\1/g')" + elif [[ "${CGROUP}" =~ machine.slice_machine.*-qemu ]] then # libvirtd / qemu virtual machines diff --git a/plugins.d/cgroup-network-helper.sh b/plugins.d/cgroup-network-helper.sh index d93fe356a..f07059986 100755 --- a/plugins.d/cgroup-network-helper.sh +++ b/plugins.d/cgroup-network-helper.sh @@ -22,7 +22,9 @@ # ----------------------------------------------------------------------------- -export PATH="${PATH}:/sbin:/usr/sbin:/usr/local/sbin" +# the system path is cleared by cgroup-network +[ -f /etc/profile ] && source /etc/profile + export LC_ALL=C PROGRAM_NAME="$(basename "${0}")" @@ -68,13 +70,6 @@ debug() { fatal "BASH version 4 or later is required (this is ${BASH_VERSION})." # ----------------------------------------------------------------------------- -# defaults to allow running this script by hand - -[ -z "${NETDATA_PLUGINS_DIR}" ] && NETDATA_PLUGINS_DIR="$(dirname "${0}")" -[ -z "${NETDATA_CONFIG_DIR}" ] && NETDATA_CONFIG_DIR="$(dirname "${0}")/../../../../etc/netdata" -[ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="$(dirname "${0}")/../../../../var/cache/netdata" - -# ----------------------------------------------------------------------------- # parse the arguments pid= @@ -172,7 +167,7 @@ virsh_find_all_interfaces_for_cgroup() { # match only 'network' interfaces from virsh output set_source "virsh" - "${virsh}" domiflist ${d} |\ + "${virsh}" -r domiflist ${d} |\ sed -n \ -e "s|^\([^[:space:]]\+\)[[:space:]]\+network[[:space:]]\+\([^[:space:]]\+\)[[:space:]]\+[^[:space:]]\+[[:space:]]\+[^[:space:]]\+$|\1 \1_\2|p" \ -e "s|^\([^[:space:]]\+\)[[:space:]]\+bridge[[:space:]]\+\([^[:space:]]\+\)[[:space:]]\+[^[:space:]]\+[[:space:]]\+[^[:space:]]\+$|\1 \1_\2|p" diff --git a/plugins.d/charts.d.plugin b/plugins.d/charts.d.plugin index c36a0cde3..9bd03fd47 100755 --- a/plugins.d/charts.d.plugin +++ b/plugins.d/charts.d.plugin @@ -477,6 +477,7 @@ all_enabled_charts() { # ----------------------------------------------------------------------------- # load the charts +suffix_retries="_retries" suffix_update_every="_update_every" active_charts= for chart in $( all_enabled_charts ) @@ -582,7 +583,7 @@ debug "run_charts='$run_charts'" [ -z "$run_charts" ] && fatal "No charts to collect data from." -declare -A charts_last_update=() charts_update_every=() charts_next_update=() charts_run_counter=() charts_serial_failures=() +declare -A charts_last_update=() charts_update_every=() charts_retries=() charts_next_update=() charts_run_counter=() charts_serial_failures=() global_update() { local exit_at \ c=0 dt ret last_ms exec_start_ms exec_end_ms \ @@ -597,7 +598,11 @@ global_update() { for chart in $run_charts do eval "charts_update_every[$chart]=\$$chart$suffix_update_every" - test -z "${charts_update_every[$chart]}" && charts_update_every[$charts]=$update_every + test -z "${charts_update_every[$chart]}" && charts_update_every[$chart]=$update_every + + eval "charts_retries[$chart]=\$$chart$suffix_retries" + test -z "${charts_retries[$chart]}" && charts_retries[$chart]=10 + charts_last_update[$chart]=$((now_ms - (now_ms % (charts_update_every[$chart] * 1000) ) )) charts_next_update[$chart]=$(( charts_last_update[$chart] + (charts_update_every[$chart] * 1000) )) charts_run_counter[$chart]=0 @@ -660,7 +665,7 @@ global_update() { else charts_serial_failures[$chart]=$(( charts_serial_failures[$chart] + 1 )) - if [ ${charts_serial_failures[$chart]} -gt 10 ] + if [ ${charts_serial_failures[$chart]} -gt ${charts_retries[$chart]} ] then error "module's '$chart' update() function reported failure ${charts_serial_failures[$chart]} times. Disabling it." else diff --git a/plugins.d/python.d.plugin b/plugins.d/python.d.plugin index 855080e81..c9b260164 100755 --- a/plugins.d/python.d.plugin +++ b/plugins.d/python.d.plugin @@ -196,7 +196,7 @@ class Plugin(object): self.runs_counter = 0 self.config, error = self.loader.load_config_from_file(PLUGIN_CONFIG_DIR + 'python.d.conf') if error: - run_and_exit(Logger.error)(error) + Logger.error('"python.d.conf" configuration file not found. Using defaults.') if not self.config.get('enabled', True): run_and_exit(Logger.info)('DISABLED in configuration file.') @@ -316,10 +316,10 @@ class Plugin(object): job.checked = True continue if not job.is_autodetect() or ok is None: - job.error('check() => [FAILED]') + job.info('check() => [FAILED]') self.delete_job(job) else: - job.error('check() => [RECHECK] (autodetection_retry: {0})'.format(job.recheck_every)) + job.info('check() => [RECHECK] (autodetection_retry: {0})'.format(job.recheck_every)) def run_create(self): for job in self.jobs: |