From 2e85f9325a797977eea9dfea0a925775ddd211d9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Feb 2021 12:49:00 +0100 Subject: Merging upstream version 1.29.0. Signed-off-by: Daniel Baumann --- health/.keep | 0 health/Makefile.am | 11 +- health/Makefile.in | 875 -------- health/QUICKSTART.md | 143 ++ health/README.md | 736 +------ health/REFERENCE.md | 797 ++++++++ health/health.c | 132 +- health/health.d/anomalies.conf | 17 + health/health.d/apps_plugin.conf | 15 + health/health.d/backend.conf | 11 + health/health.d/cockroachdb.conf | 91 + health/health.d/dbengine.conf | 26 +- health/health.d/dns_query.conf | 12 + health/health.d/elasticsearch.conf | 7 +- health/health.d/exporting.conf | 34 + health/health.d/mdstat.conf | 7 +- health/health.d/megacli.conf | 6 +- health/health.d/mysql.conf | 2 +- health/health.d/net.conf | 29 +- health/health.d/portcheck.conf | 4 +- health/health.d/processes.conf | 26 +- health/health.d/pulsar.conf | 13 + health/health.d/ram.conf | 4 +- health/health.d/scaleio.conf | 38 + health/health.d/softnet.conf | 2 +- health/health.d/unbound.conf | 35 + health/health.d/vernemq.conf | 399 ++++ health/health.d/web_log.conf | 2 - health/health.d/whoisquery.conf | 24 + health/health.d/x509check.conf | 8 + health/health.h | 10 +- health/health_config.c | 65 +- health/health_json.c | 83 +- health/health_log.c | 71 +- health/notifications/Makefile.am | 3 + health/notifications/Makefile.in | 835 -------- health/notifications/README.md | 22 +- health/notifications/alarm-notify.sh | 2428 ---------------------- health/notifications/alarm-notify.sh.in | 2701 ++++++++++++++----------- health/notifications/alerta/README.md | 10 +- health/notifications/awssns/README.md | 12 +- health/notifications/custom/README.md | 7 +- health/notifications/discord/README.md | 5 + health/notifications/dynatrace/Makefile.inc | 12 + health/notifications/dynatrace/README.md | 36 + health/notifications/email/README.md | 7 +- health/notifications/flock/README.md | 7 +- health/notifications/hangouts/README.md | 38 +- health/notifications/health_alarm_notify.conf | 133 +- health/notifications/irc/README.md | 7 +- health/notifications/kavenegar/README.md | 9 +- health/notifications/matrix/Makefile.inc | 12 + health/notifications/matrix/README.md | 58 + health/notifications/messagebird/README.md | 5 + health/notifications/opsgenie/Makefile.inc | 12 + health/notifications/opsgenie/README.md | 59 + health/notifications/pagerduty/README.md | 9 + health/notifications/prowl/Makefile.inc | 12 + health/notifications/prowl/README.md | 29 + health/notifications/pushbullet/README.md | 5 + health/notifications/pushover/README.md | 5 + health/notifications/rocketchat/README.md | 7 +- health/notifications/slack/README.md | 5 + health/notifications/smstools3/README.md | 5 + health/notifications/stackpulse/Makefile.inc | 12 + health/notifications/stackpulse/README.md | 80 + health/notifications/syslog/README.md | 5 + health/notifications/telegram/README.md | 7 +- health/notifications/twilio/README.md | 5 + health/notifications/web/README.md | 5 + 70 files changed, 4177 insertions(+), 6167 deletions(-) delete mode 100644 health/.keep delete mode 100644 health/Makefile.in create mode 100644 health/QUICKSTART.md create mode 100644 health/REFERENCE.md create mode 100644 health/health.d/anomalies.conf create mode 100644 health/health.d/apps_plugin.conf create mode 100644 health/health.d/cockroachdb.conf create mode 100644 health/health.d/dns_query.conf create mode 100644 health/health.d/exporting.conf create mode 100644 health/health.d/pulsar.conf create mode 100644 health/health.d/scaleio.conf create mode 100644 health/health.d/unbound.conf create mode 100644 health/health.d/vernemq.conf create mode 100644 health/health.d/whoisquery.conf delete mode 100644 health/notifications/Makefile.in delete mode 100644 health/notifications/alarm-notify.sh create mode 100644 health/notifications/dynatrace/Makefile.inc create mode 100644 health/notifications/dynatrace/README.md create mode 100644 health/notifications/matrix/Makefile.inc create mode 100644 health/notifications/matrix/README.md create mode 100644 health/notifications/opsgenie/Makefile.inc create mode 100644 health/notifications/opsgenie/README.md create mode 100644 health/notifications/prowl/Makefile.inc create mode 100644 health/notifications/prowl/README.md create mode 100644 health/notifications/stackpulse/Makefile.inc create mode 100644 health/notifications/stackpulse/README.md (limited to 'health') diff --git a/health/.keep b/health/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/health/Makefile.am b/health/Makefile.am index f63faa8af..399d6df5a 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -16,7 +16,6 @@ dist_noinst_DATA = \ userhealthconfigdir=$(configdir)/health.d dist_userhealthconfig_DATA = \ - .keep \ $(NULL) # Explicitly install directories to avoid permission issues due to umask @@ -27,8 +26,10 @@ healthconfigdir=$(libconfigdir)/health.d dist_healthconfig_DATA = \ health.d/adaptec_raid.conf \ health.d/am2320.conf \ + health.d/anomalies.conf \ health.d/apache.conf \ health.d/apcupsd.conf \ + health.d/apps_plugin.conf \ health.d/backend.conf \ health.d/bcache.conf \ health.d/beanstalkd.conf \ @@ -38,12 +39,15 @@ dist_healthconfig_DATA = \ health.d/ceph.conf \ health.d/cgroups.conf \ health.d/cpu.conf \ + health.d/cockroachdb.conf \ health.d/couchdb.conf \ health.d/disks.conf \ health.d/dnsmasq_dhcp.conf \ + health.d/dns_query.conf \ health.d/dockerd.conf \ health.d/elasticsearch.conf \ health.d/entropy.conf \ + health.d/exporting.conf \ health.d/fping.conf \ health.d/ioping.conf \ health.d/fronius.conf \ @@ -75,11 +79,13 @@ dist_healthconfig_DATA = \ health.d/portcheck.conf \ health.d/postgres.conf \ health.d/processes.conf \ + health.d/pulsar.conf \ health.d/qos.conf \ health.d/ram.conf \ health.d/redis.conf \ health.d/retroshare.conf \ health.d/riakkv.conf \ + health.d/scaleio.conf \ health.d/softnet.conf \ health.d/squid.conf \ health.d/stiebeleltron.conf \ @@ -90,10 +96,13 @@ dist_healthconfig_DATA = \ health.d/tcp_orphans.conf \ health.d/tcp_resets.conf \ health.d/udp_errors.conf \ + health.d/unbound.conf \ health.d/varnish.conf \ health.d/vcsa.conf \ + health.d/vernemq.conf \ health.d/vsphere.conf \ health.d/web_log.conf \ + health.d/whoisquery.conf \ health.d/wmi.conf \ health.d/x509check.conf \ health.d/zfs.conf \ diff --git a/health/Makefile.in b/health/Makefile.in deleted file mode 100644 index fe598dcad..000000000 --- a/health/Makefile.in +++ /dev/null @@ -1,875 +0,0 @@ -# Makefile.in generated by automake 1.15.1 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994-2017 Free Software Foundation, Inc. - -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -# SPDX-License-Identifier: GPL-3.0-or-later - -VPATH = @srcdir@ -am__is_gnu_make = { \ - if test -z '$(MAKELEVEL)'; then \ - false; \ - elif test -n '$(MAKE_HOST)'; then \ - true; \ - elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ - true; \ - else \ - false; \ - fi; \ -} -am__make_running_with_option = \ - case $${target_option-} in \ - ?) ;; \ - *) echo "am__make_running_with_option: internal error: invalid" \ - "target option '$${target_option-}' specified" >&2; \ - exit 1;; \ - esac; \ - has_opt=no; \ - sane_makeflags=$$MAKEFLAGS; \ - if $(am__is_gnu_make); then \ - sane_makeflags=$$MFLAGS; \ - else \ - case $$MAKEFLAGS in \ - *\\[\ \ ]*) \ - bs=\\; \ - sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ - | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ - esac; \ - fi; \ - skip_next=no; \ - strip_trailopt () \ - { \ - flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ - }; \ - for flg in $$sane_makeflags; do \ - test $$skip_next = yes && { skip_next=no; continue; }; \ - case $$flg in \ - *=*|--*) continue;; \ - -*I) strip_trailopt 'I'; skip_next=yes;; \ - -*I?*) strip_trailopt 'I';; \ - -*O) strip_trailopt 'O'; skip_next=yes;; \ - -*O?*) strip_trailopt 'O';; \ - -*l) strip_trailopt 'l'; skip_next=yes;; \ - -*l?*) strip_trailopt 'l';; \ - -[dEDm]) skip_next=yes;; \ - -[JT]) skip_next=yes;; \ - esac; \ - case $$flg in \ - *$$target_option*) has_opt=yes; break;; \ - esac; \ - done; \ - test $$has_opt = yes -am__make_dryrun = (target_option=n; $(am__make_running_with_option)) -am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) -pkgdatadir = $(datadir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkglibexecdir = $(libexecdir)/@PACKAGE@ -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -subdir = health -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/build/m4/ax_c___atomic.m4 \ - $(top_srcdir)/build/m4/ax_c__generic.m4 \ - $(top_srcdir)/build/m4/ax_c_lto.m4 \ - $(top_srcdir)/build/m4/ax_c_mallinfo.m4 \ - $(top_srcdir)/build/m4/ax_c_mallopt.m4 \ - $(top_srcdir)/build/m4/ax_check_compile_flag.m4 \ - $(top_srcdir)/build/m4/ax_gcc_func_attribute.m4 \ - $(top_srcdir)/build/m4/ax_pthread.m4 \ - $(top_srcdir)/build/m4/jemalloc.m4 \ - $(top_srcdir)/build/m4/tcmalloc.m4 $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -DIST_COMMON = $(srcdir)/Makefile.am $(dist_healthconfig_DATA) \ - $(dist_noinst_DATA) $(dist_userhealthconfig_DATA) \ - $(am__DIST_COMMON) -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = $(top_builddir)/config.h -CONFIG_CLEAN_FILES = -CONFIG_CLEAN_VPATH_FILES = -AM_V_P = $(am__v_P_@AM_V@) -am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) -am__v_P_0 = false -am__v_P_1 = : -AM_V_GEN = $(am__v_GEN_@AM_V@) -am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) -am__v_GEN_0 = @echo " GEN " $@; -am__v_GEN_1 = -AM_V_at = $(am__v_at_@AM_V@) -am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) -am__v_at_0 = @ -am__v_at_1 = -SOURCES = -DIST_SOURCES = -RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \ - ctags-recursive dvi-recursive html-recursive info-recursive \ - install-data-recursive install-dvi-recursive \ - install-exec-recursive install-html-recursive \ - install-info-recursive install-pdf-recursive \ - install-ps-recursive install-recursive installcheck-recursive \ - installdirs-recursive pdf-recursive ps-recursive \ - tags-recursive uninstall-recursive -am__can_run_installinfo = \ - case $$AM_UPDATE_INFO_DIR in \ - n|no|NO) false;; \ - *) (install-info --version) >/dev/null 2>&1;; \ - esac -am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; -am__vpath_adj = case $$p in \ - $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ - *) f=$$p;; \ - esac; -am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; -am__install_max = 40 -am__nobase_strip_setup = \ - srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` -am__nobase_strip = \ - for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" -am__nobase_list = $(am__nobase_strip_setup); \ - for p in $$list; do echo "$$p $$p"; done | \ - sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ - $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ - if (++n[$$2] == $(am__install_max)) \ - { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ - END { for (dir in files) print dir, files[dir] }' -am__base_list = \ - sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ - sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' -am__uninstall_files_from_dir = { \ - test -z "$$files" \ - || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ - || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ - $(am__cd) "$$dir" && rm -f $$files; }; \ - } -am__installdirs = "$(DESTDIR)$(healthconfigdir)" \ - "$(DESTDIR)$(userhealthconfigdir)" -DATA = $(dist_healthconfig_DATA) $(dist_noinst_DATA) \ - $(dist_userhealthconfig_DATA) -RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ - distclean-recursive maintainer-clean-recursive -am__recursive_targets = \ - $(RECURSIVE_TARGETS) \ - $(RECURSIVE_CLEAN_TARGETS) \ - $(am__extra_recursive_targets) -AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \ - distdir -am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) -# Read a list of newline-separated strings from the standard input, -# and print each of them once, without duplicates. Input order is -# *not* preserved. -am__uniquify_input = $(AWK) '\ - BEGIN { nonempty = 0; } \ - { items[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in items) print i; }; } \ -' -# Make sure the list of sources is unique. This is necessary because, -# e.g., the same source file might be shared among _SOURCES variables -# for different programs/libraries. -am__define_uniq_tagged_files = \ - list='$(am__tagged_files)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | $(am__uniquify_input)` -ETAGS = etags -CTAGS = ctags -DIST_SUBDIRS = $(SUBDIRS) -am__DIST_COMMON = $(srcdir)/Makefile.in -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -am__relativize = \ - dir0=`pwd`; \ - sed_first='s,^\([^/]*\)/.*$$,\1,'; \ - sed_rest='s,^[^/]*/*,,'; \ - sed_last='s,^.*/\([^/]*\)$$,\1,'; \ - sed_butlast='s,/*[^/]*$$,,'; \ - while test -n "$$dir1"; do \ - first=`echo "$$dir1" | sed -e "$$sed_first"`; \ - if test "$$first" != "."; then \ - if test "$$first" = ".."; then \ - dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ - dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ - else \ - first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ - if test "$$first2" = "$$first"; then \ - dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ - else \ - dir2="../$$dir2"; \ - fi; \ - dir0="$$dir0"/"$$first"; \ - fi; \ - fi; \ - dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ - done; \ - reldir="$$dir2" -ACLOCAL = @ACLOCAL@ -AMTAR = @AMTAR@ -AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CMOCKA_CFLAGS = @CMOCKA_CFLAGS@ -CMOCKA_LIBS = @CMOCKA_LIBS@ -CPP = @CPP@ -CPPFLAGS = @CPPFLAGS@ -CUPSCONFIG = @CUPSCONFIG@ -CXX = @CXX@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CXX_BINARY = @CXX_BINARY@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -ENABLE_UNITTESTS = @ENABLE_UNITTESTS@ -EXEEXT = @EXEEXT@ -GREP = @GREP@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -IPMIMONITORING_CFLAGS = @IPMIMONITORING_CFLAGS@ -IPMIMONITORING_LIBS = @IPMIMONITORING_LIBS@ -JSON_CFLAGS = @JSON_CFLAGS@ -JSON_LIBS = @JSON_LIBS@ -LDFLAGS = @LDFLAGS@ -LIBCAP_CFLAGS = @LIBCAP_CFLAGS@ -LIBCAP_LIBS = @LIBCAP_LIBS@ -LIBCRYPTO_CFLAGS = @LIBCRYPTO_CFLAGS@ -LIBCRYPTO_LIBS = @LIBCRYPTO_LIBS@ -LIBCURL_CFLAGS = @LIBCURL_CFLAGS@ -LIBCURL_LIBS = @LIBCURL_LIBS@ -LIBMNL_CFLAGS = @LIBMNL_CFLAGS@ -LIBMNL_LIBS = @LIBMNL_LIBS@ -LIBMONGOC_CFLAGS = @LIBMONGOC_CFLAGS@ -LIBMONGOC_LIBS = @LIBMONGOC_LIBS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LIBSSL_CFLAGS = @LIBSSL_CFLAGS@ -LIBSSL_LIBS = @LIBSSL_LIBS@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ -MATH_CFLAGS = @MATH_CFLAGS@ -MATH_LIBS = @MATH_LIBS@ -MKDIR_P = @MKDIR_P@ -NFACCT_CFLAGS = @NFACCT_CFLAGS@ -NFACCT_LIBS = @NFACCT_LIBS@ -OBJEXT = @OBJEXT@ -OPTIONAL_CUPS_CFLAGS = @OPTIONAL_CUPS_CFLAGS@ -OPTIONAL_CUPS_LIBS = @OPTIONAL_CUPS_LIBS@ -OPTIONAL_IPMIMONITORING_CFLAGS = @OPTIONAL_IPMIMONITORING_CFLAGS@ -OPTIONAL_IPMIMONITORING_LIBS = @OPTIONAL_IPMIMONITORING_LIBS@ -OPTIONAL_JSONC_LIBS = @OPTIONAL_JSONC_LIBS@ -OPTIONAL_JUDY_LIBS = @OPTIONAL_JUDY_LIBS@ -OPTIONAL_KINESIS_CFLAGS = @OPTIONAL_KINESIS_CFLAGS@ -OPTIONAL_KINESIS_LIBS = @OPTIONAL_KINESIS_LIBS@ -OPTIONAL_LIBCAP_CFLAGS = @OPTIONAL_LIBCAP_CFLAGS@ -OPTIONAL_LIBCAP_LIBS = @OPTIONAL_LIBCAP_LIBS@ -OPTIONAL_LZ4_LIBS = @OPTIONAL_LZ4_LIBS@ -OPTIONAL_MATH_CFLAGS = @OPTIONAL_MATH_CFLAGS@ -OPTIONAL_MATH_LIBS = @OPTIONAL_MATH_LIBS@ -OPTIONAL_MONGOC_CFLAGS = @OPTIONAL_MONGOC_CFLAGS@ -OPTIONAL_MONGOC_LIBS = @OPTIONAL_MONGOC_LIBS@ -OPTIONAL_NFACCT_CFLAGS = @OPTIONAL_NFACCT_CFLAGS@ -OPTIONAL_NFACCT_LIBS = @OPTIONAL_NFACCT_LIBS@ -OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS = @OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS@ -OPTIONAL_PROMETHEUS_REMOTE_WRITE_LIBS = @OPTIONAL_PROMETHEUS_REMOTE_WRITE_LIBS@ -OPTIONAL_SSL_LIBS = @OPTIONAL_SSL_LIBS@ -OPTIONAL_UUID_CFLAGS = @OPTIONAL_UUID_CFLAGS@ -OPTIONAL_UUID_LIBS = @OPTIONAL_UUID_LIBS@ -OPTIONAL_UV_LIBS = @OPTIONAL_UV_LIBS@ -OPTIONAL_XENSTAT_CFLAGS = @OPTIONAL_XENSTAT_CFLAGS@ -OPTIONAL_XENSTAT_LIBS = @OPTIONAL_XENSTAT_LIBS@ -OPTIONAL_ZLIB_CFLAGS = @OPTIONAL_ZLIB_CFLAGS@ -OPTIONAL_ZLIB_LIBS = @OPTIONAL_ZLIB_LIBS@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_RPM_VERSION = @PACKAGE_RPM_VERSION@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_URL = @PACKAGE_URL@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PKG_CONFIG = @PKG_CONFIG@ -PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ -PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ -PROTOBUF_CFLAGS = @PROTOBUF_CFLAGS@ -PROTOBUF_LIBS = @PROTOBUF_LIBS@ -PROTOC = @PROTOC@ -PTHREAD_CC = @PTHREAD_CC@ -PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ -PTHREAD_LIBS = @PTHREAD_LIBS@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -SSE_CANDIDATE = @SSE_CANDIDATE@ -STRIP = @STRIP@ -TEST_CFLAGS = @TEST_CFLAGS@ -TEST_LIBS = @TEST_LIBS@ -UUID_CFLAGS = @UUID_CFLAGS@ -UUID_LIBS = @UUID_LIBS@ -VERSION = @VERSION@ -XENLIGHT_CFLAGS = @XENLIGHT_CFLAGS@ -XENLIGHT_LIBS = @XENLIGHT_LIBS@ -YAJL_CFLAGS = @YAJL_CFLAGS@ -YAJL_LIBS = @YAJL_LIBS@ -ZLIB_CFLAGS = @ZLIB_CFLAGS@ -ZLIB_LIBS = @ZLIB_LIBS@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -ax_pthread_config = @ax_pthread_config@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_target = @build_target@ -build_vendor = @build_vendor@ -builddir = @builddir@ -cachedir = @cachedir@ -chartsdir = @chartsdir@ -configdir = @configdir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -has_jemalloc = @has_jemalloc@ -has_tcmalloc = @has_tcmalloc@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libconfigdir = @libconfigdir@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -logdir = @logdir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -nodedir = @nodedir@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -pluginsdir = @pluginsdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -pythondir = @pythondir@ -registrydir = @registrydir@ -runstatedir = @runstatedir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target_alias = @target_alias@ -top_build_prefix = @top_build_prefix@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ -varlibdir = @varlibdir@ -webdir = @webdir@ -AUTOMAKE_OPTIONS = subdir-objects -MAINTAINERCLEANFILES = $(srcdir)/Makefile.in -SUBDIRS = \ - notifications \ - $(NULL) - -CLEANFILES = \ - $(NULL) - -dist_noinst_DATA = \ - README.md \ - $(NULL) - -userhealthconfigdir = $(configdir)/health.d -dist_userhealthconfig_DATA = \ - .keep \ - $(NULL) - -healthconfigdir = $(libconfigdir)/health.d -dist_healthconfig_DATA = \ - health.d/adaptec_raid.conf \ - health.d/am2320.conf \ - health.d/apache.conf \ - health.d/apcupsd.conf \ - health.d/backend.conf \ - health.d/bcache.conf \ - health.d/beanstalkd.conf \ - health.d/bind_rndc.conf \ - health.d/boinc.conf \ - health.d/btrfs.conf \ - health.d/ceph.conf \ - health.d/cgroups.conf \ - health.d/cpu.conf \ - health.d/couchdb.conf \ - health.d/disks.conf \ - health.d/dnsmasq_dhcp.conf \ - health.d/dockerd.conf \ - health.d/elasticsearch.conf \ - health.d/entropy.conf \ - health.d/fping.conf \ - health.d/ioping.conf \ - health.d/fronius.conf \ - health.d/gearman.conf \ - health.d/haproxy.conf \ - health.d/hdfs.conf \ - health.d/httpcheck.conf \ - health.d/ipc.conf \ - health.d/ipfs.conf \ - health.d/ipmi.conf \ - health.d/isc_dhcpd.conf \ - health.d/kubelet.conf \ - health.d/lighttpd.conf \ - health.d/linux_power_supply.conf \ - health.d/load.conf \ - health.d/mdstat.conf \ - health.d/megacli.conf \ - health.d/memcached.conf \ - health.d/memory.conf \ - health.d/mongodb.conf \ - health.d/mysql.conf \ - health.d/named.conf \ - health.d/net.conf \ - health.d/netfilter.conf \ - health.d/nginx.conf \ - health.d/nginx_plus.conf \ - health.d/pihole.conf \ - health.d/phpfpm.conf \ - health.d/portcheck.conf \ - health.d/postgres.conf \ - health.d/processes.conf \ - health.d/qos.conf \ - health.d/ram.conf \ - health.d/redis.conf \ - health.d/retroshare.conf \ - health.d/riakkv.conf \ - health.d/softnet.conf \ - health.d/squid.conf \ - health.d/stiebeleltron.conf \ - health.d/swap.conf \ - health.d/tcp_conn.conf \ - health.d/tcp_listen.conf \ - health.d/tcp_mem.conf \ - health.d/tcp_orphans.conf \ - health.d/tcp_resets.conf \ - health.d/udp_errors.conf \ - health.d/varnish.conf \ - health.d/vcsa.conf \ - health.d/vsphere.conf \ - health.d/web_log.conf \ - health.d/wmi.conf \ - health.d/x509check.conf \ - health.d/zfs.conf \ - health.d/zookeeper.conf \ - health.d/dbengine.conf \ - $(NULL) - -all: all-recursive - -.SUFFIXES: -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ - && { if test -f $@; then exit 0; else break; fi; }; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu health/Makefile'; \ - $(am__cd) $(top_srcdir) && \ - $(AUTOMAKE) --gnu health/Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(am__aclocal_m4_deps): -install-dist_healthconfigDATA: $(dist_healthconfig_DATA) - @$(NORMAL_INSTALL) - @list='$(dist_healthconfig_DATA)'; test -n "$(healthconfigdir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(healthconfigdir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(healthconfigdir)" || exit 1; \ - fi; \ - for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; \ - done | $(am__base_list) | \ - while read files; do \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(healthconfigdir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(healthconfigdir)" || exit $$?; \ - done - -uninstall-dist_healthconfigDATA: - @$(NORMAL_UNINSTALL) - @list='$(dist_healthconfig_DATA)'; test -n "$(healthconfigdir)" || list=; \ - files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ - dir='$(DESTDIR)$(healthconfigdir)'; $(am__uninstall_files_from_dir) -install-dist_userhealthconfigDATA: $(dist_userhealthconfig_DATA) - @$(NORMAL_INSTALL) - @list='$(dist_userhealthconfig_DATA)'; test -n "$(userhealthconfigdir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(userhealthconfigdir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(userhealthconfigdir)" || exit 1; \ - fi; \ - for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; \ - done | $(am__base_list) | \ - while read files; do \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(userhealthconfigdir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(userhealthconfigdir)" || exit $$?; \ - done - -uninstall-dist_userhealthconfigDATA: - @$(NORMAL_UNINSTALL) - @list='$(dist_userhealthconfig_DATA)'; test -n "$(userhealthconfigdir)" || list=; \ - files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ - dir='$(DESTDIR)$(userhealthconfigdir)'; $(am__uninstall_files_from_dir) - -# This directory's subdirectories are mostly independent; you can cd -# into them and run 'make' without going through this Makefile. -# To change the values of 'make' variables: instead of editing Makefiles, -# (1) if the variable is set in 'config.status', edit 'config.status' -# (which will cause the Makefiles to be regenerated when you run 'make'); -# (2) otherwise, pass the desired values on the 'make' command line. -$(am__recursive_targets): - @fail=; \ - if $(am__make_keepgoing); then \ - failcom='fail=yes'; \ - else \ - failcom='exit 1'; \ - fi; \ - dot_seen=no; \ - target=`echo $@ | sed s/-recursive//`; \ - case "$@" in \ - distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ - *) list='$(SUBDIRS)' ;; \ - esac; \ - for subdir in $$list; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - dot_seen=yes; \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done; \ - if test "$$dot_seen" = "no"; then \ - $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ - fi; test -z "$$fail" - -ID: $(am__tagged_files) - $(am__define_uniq_tagged_files); mkid -fID $$unique -tags: tags-recursive -TAGS: tags - -tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) - set x; \ - here=`pwd`; \ - if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ - include_option=--etags-include; \ - empty_fix=.; \ - else \ - include_option=--include; \ - empty_fix=; \ - fi; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test ! -f $$subdir/TAGS || \ - set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ - fi; \ - done; \ - $(am__define_uniq_tagged_files); \ - shift; \ - if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - if test $$# -gt 0; then \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - "$$@" $$unique; \ - else \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$unique; \ - fi; \ - fi -ctags: ctags-recursive - -CTAGS: ctags -ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) - $(am__define_uniq_tagged_files); \ - test -z "$(CTAGS_ARGS)$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && $(am__cd) $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) "$$here" -cscopelist: cscopelist-recursive - -cscopelist-am: $(am__tagged_files) - list='$(am__tagged_files)'; \ - case "$(srcdir)" in \ - [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ - *) sdir=$(subdir)/$(srcdir) ;; \ - esac; \ - for i in $$list; do \ - if test -f "$$i"; then \ - echo "$(subdir)/$$i"; \ - else \ - echo "$$sdir/$$i"; \ - fi; \ - done >> $(top_builddir)/cscope.files - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d "$(distdir)/$$file"; then \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ - else \ - test -f "$(distdir)/$$file" \ - || cp -p $$d/$$file "$(distdir)/$$file" \ - || exit 1; \ - fi; \ - done - @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - $(am__make_dryrun) \ - || test -d "$(distdir)/$$subdir" \ - || $(MKDIR_P) "$(distdir)/$$subdir" \ - || exit 1; \ - dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ - $(am__relativize); \ - new_distdir=$$reldir; \ - dir1=$$subdir; dir2="$(top_distdir)"; \ - $(am__relativize); \ - new_top_distdir=$$reldir; \ - echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ - echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ - ($(am__cd) $$subdir && \ - $(MAKE) $(AM_MAKEFLAGS) \ - top_distdir="$$new_top_distdir" \ - distdir="$$new_distdir" \ - am__remove_distdir=: \ - am__skip_length_check=: \ - am__skip_mode_fix=: \ - distdir) \ - || exit 1; \ - fi; \ - done -check-am: all-am -check: check-recursive -all-am: Makefile $(DATA) -installdirs: installdirs-recursive -installdirs-am: - for dir in "$(DESTDIR)$(healthconfigdir)" "$(DESTDIR)$(userhealthconfigdir)"; do \ - test -z "$$dir" || $(MKDIR_P) "$$dir"; \ - done -install: install-recursive -install-exec: install-exec-recursive -install-data: install-data-recursive -uninstall: uninstall-recursive - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-recursive -install-strip: - if test -z '$(STRIP)'; then \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - install; \ - else \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ - fi -mostlyclean-generic: - -clean-generic: - -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES) - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." - -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES) -clean: clean-recursive - -clean-am: clean-generic mostlyclean-am - -distclean: distclean-recursive - -rm -f Makefile -distclean-am: clean-am distclean-generic distclean-tags - -dvi: dvi-recursive - -dvi-am: - -html: html-recursive - -html-am: - -info: info-recursive - -info-am: - -install-data-am: install-dist_healthconfigDATA \ - install-dist_userhealthconfigDATA - -install-dvi: install-dvi-recursive - -install-dvi-am: - -install-exec-am: install-exec-local - -install-html: install-html-recursive - -install-html-am: - -install-info: install-info-recursive - -install-info-am: - -install-man: - -install-pdf: install-pdf-recursive - -install-pdf-am: - -install-ps: install-ps-recursive - -install-ps-am: - -installcheck-am: - -maintainer-clean: maintainer-clean-recursive - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-recursive - -mostlyclean-am: mostlyclean-generic - -pdf: pdf-recursive - -pdf-am: - -ps: ps-recursive - -ps-am: - -uninstall-am: uninstall-dist_healthconfigDATA \ - uninstall-dist_userhealthconfigDATA - -.MAKE: $(am__recursive_targets) install-am install-strip - -.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \ - check-am clean clean-generic cscopelist-am ctags ctags-am \ - distclean distclean-generic distclean-tags distdir dvi dvi-am \ - html html-am info info-am install install-am install-data \ - install-data-am install-dist_healthconfigDATA \ - install-dist_userhealthconfigDATA install-dvi install-dvi-am \ - install-exec install-exec-am install-exec-local install-html \ - install-html-am install-info install-info-am install-man \ - install-pdf install-pdf-am install-ps install-ps-am \ - install-strip installcheck installcheck-am installdirs \ - installdirs-am maintainer-clean maintainer-clean-generic \ - mostlyclean mostlyclean-generic pdf pdf-am ps ps-am tags \ - tags-am uninstall uninstall-am uninstall-dist_healthconfigDATA \ - uninstall-dist_userhealthconfigDATA - -.PRECIOUS: Makefile - - -# Explicitly install directories to avoid permission issues due to umask -install-exec-local: - $(INSTALL) -d $(DESTDIR)$(userhealthconfigdir) - -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff --git a/health/QUICKSTART.md b/health/QUICKSTART.md new file mode 100644 index 000000000..bc91caf7c --- /dev/null +++ b/health/QUICKSTART.md @@ -0,0 +1,143 @@ + + +# Health quickstart + +In this quickstart guide, you'll learn the basics of editing health configuration files. With this knowledge, you +will be able to customize how and when Netdata triggers alarms based on the health and performance of your system or +infrastructure. + +To learn about more advanced health configurations, visit the [health reference guide](/health/REFERENCE.md). + +## Edit health configuration files + +You should [use `edit-config`](/docs/configure/nodes.md) to edit Netdata's health configuration files. `edit-config` +will open your system's default terminal editor for you to make your changes. Once you've saved and closed the editor, +`edit-config` will copy your edited file into `/etc/netdata/health.d/`, which will override the stock file in +`/usr/lib/netdata/conf.d/health.d/` and ensure your customizations are persistent between updates. + +For example, to edit the `cpu.conf` health configuration file, you would run: + +```bash +cd /etc/netdata/ # Replace with your Netdata configuration directory, if not /etc/netdata/ +./edit-config health.d/cpu.conf +``` + +Each health configuration file contains one or more health entities, which always begin with an `alarm:` or `template:` +line. You can edit these entities based on your needs. To make any changes live, be sure to [reload your health +configuration](#reload-health-configuration). + +## Reference Netdata's stock health configuration files + +While you should always [use `edit-config`](#edit-health-configuration-files), you might also want to view the stock +health configuration files Netdata ships with. Stock files can be useful as reference material, or to determine which +file you should edit with `edit-config`. + +By default, Netdata will put health configuration files in `/usr/lib/netdata/conf.d/health.d`. However, you can +double-check the location of these files by navigating to `http://NODE:19999/netdata.conf`, replacing `NODE` with the IP +address or hostname for your Agent dashboard, looking for the `stock health configuration directory` option. The value +here will show the correct path for your installation. + +```conf +[health] + ... + # stock health configuration directory = /usr/lib/netdata/conf.d/health.d +``` + +Navigate to the health configuration directory to see all the available files and open them for reading. + +```bash +cd /usr/lib/netdata/conf.d/health.d/ +ls +adaptec_raid.conf entropy.conf memory.conf squid.conf +am2320.conf fping.conf mongodb.conf stiebeleltron.conf +apache.conf fronius.conf mysql.conf swap.conf +... +``` + +> ⚠️ If you edit configuration files in your stock health configuration directory, Netdata will overwrite them during +> any updates. Please use `edit-config` as described in the [section above](#edit-health-configuration-files). + +## Write a new health entity + +While tuning existing alarms may work in some cases, you may need to write entirely new health entities based on how +your systems and applications work. + +To write a new health entity, let's create a new file inside of the `health.d/` directory. We'll name our file +`example.conf` for now. + +```bash +./edit-config health.d/example.conf +``` + +As an example, let's build a health entity that triggers an alarm your system's RAM usage goes above 80%. Copy and paste +the following into the editor: + +```yaml + alarm: ram_usage + on: system.ram +lookup: average -1m percentage of used + units: % + every: 1m + warn: $this > 80 + crit: $this > 90 + info: The percentage of RAM used by the system. +``` + +Let's look into each of the lines to see how they create a working health entity. + +- `alarm`: The name for your new entity. The name needs to follow these requirements: + - Any alphabet letter or number. + - The symbols `.` and `_`. + - Cannot be `chart name`, `dimension name`, `family name`, or `chart variable names`. +- `on`: Which chart the entity listens to. +- `lookup`: Which metrics the alarm monitors, the duration of time to monitor, and how to process the metrics into a + usable format. + - `average`: Calculate the average of all the metrics collected. + - `-1m`: Use metrics from 1 minute ago until now to calculate that average. + - `percentage`: Clarify that we're calculating a percentage of RAM usage. + - `of used`: Specify which dimension (`used`) on the `system.ram` chart you want to monitor with this entity. +- `units`: Use percentages rather than absolute units. +- `every`: How often to perform the `lookup` calculation to decide whether or not to trigger this alarm. +- `warn`/`crit`: The value at which Netdata should trigger a warning or critical alarm. +- `info`: A description of the alarm, which will appear in the dashboard and notifications. + +Let's put all these lines into a human-readable format. + +This health entity, named **ram_usage**, watches at the **system.ram** chart. It looks up the last **1 minute** of +metrics from the **used** dimension and calculates the **average** of all those metrics in a **percentage** format, +using a **% unit**. The entity performs this lookup **every minute**. If the average RAM usage percentage over the last +1 minute is **more than 80%**, the entity triggers a warning alarm. If the usage is **more than 90%**, the entity +triggers a critical alarm. + +Now that you've written a new health entity, you need to reload it to see it live on the dashboard. + +## Reload health configuration + +To make any changes to your health configuration live, you must reload Netdata's health monitoring system. To do that +without restarting all of Netdata, run the following: + +```bash +netdatacli reload-health +``` + +If you receive an error like `command not found`, this means that `netdatacli` is not installed in your `$PATH`. In that + case, you can reload only the health component by sending a `SIGUSR2` to Netdata: + +```bash +killall -USR2 netdata +``` +## What's next? + +To learn about all of Netdata's health configuration options, view the [reference guide](/health/REFERENCE.md) and +[daemon configuration](/daemon/config/README.md#health-section-options) for additional options available in the +`[health]` section of `netdata.conf`. + +Or, get guided insights into specific health configurations with our [health guides](/health/README.md#guides). + +Finally, move on to Netdata's [notification system](/health/notifications/README.md) to learn more about how Netdata can +let you know when the health of your systems or apps goes awry. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fquickstart%2F&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/README.md b/health/README.md index 0ffbbdb51..37f09e848 100644 --- a/health/README.md +++ b/health/README.md @@ -1,726 +1,38 @@ -# Health monitoring - -Each Netdata node runs an independent thread evaluating health monitoring checks. -This thread has lock free access to the database, so that it can operate as a watchdog. - -Health checks (alarms) are attached to Netdata charts, allowing Netdata to automatically -activate an alarm as soon as a chart is created. This is very important for -netdata, since many charts are dynamically created during runtime (for example, the -chart tracking network interface packet drops, is automatically created on the first -packet dropped). - -Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.). - -Each alarm can execute a single query to the database using statistical algorithms against past data, -but alarms can be combined. So, if you need 2 queries in the database, you can combine -2 alarms together (both will run a query to the database, and the results can be combined). - -Each alarm has unlimited access to all the metrics collected. So, a single alarm can -use expressions combining the latest value of any number of metrics. - -## Health configuration reference - -Stock Netdata health configuration is in `/usr/lib/netdata/conf.d/health.d`. -These files can be overwritten by copying them and editing them in `/etc/netdata/health.d` -(run `/etc/netdata/edit-config` to edit them). - -In `/etc/netdata/health.d` you can also put any number of files (in any number of sub-directories) -with a suffix `.conf` to have them processed by Netdata. - -Health configuration can be reloaded at any time, without restarting Netdata. -Just send Netdata the SIGUSR2 signal, like this: - -```sh -killall -USR2 netdata -``` - -### Entities in the health files - -There are 2 entities: - -1. **alarms**, which are attached to specific charts, and - -2. **templates**, which define rules that should be applied to all charts having a - specific `context`. You can use this feature to apply **alarms** to all disks, - all network interfaces, all mysql databases, all nginx web servers, etc. - -Both of these entities have exactly the same format and feature set. -The only difference is the label `alarm` or `template`. - -Netdata supports overriding **templates** with **alarms**. -For example, when a template is defined for a set of charts, an alarm with exactly the -same name attached to the same chart the template matches, will have higher precedence -(i.e. Netdata will use the alarm on this chart and prevent the template from being applied -to it). - -### The format - -The following lines are parsed. - -#### Alarm line `alarm` or `template` - -This line starts an alarm or alarm template. - -``` -alarm: NAME -``` - -or - -``` -template: NAME -``` - -This line has to be first on each alarm or template. -`NAME` is anything you would like to name it (the only symbols allowed are `.` and `_`). - ---- - -#### Alarm line `on` - -This line defines the data the alarm should be attached to. - -For alarms: - -``` -on: CHART -``` - -For `CHART` you can use a chart `id` or `name` of the chart, as shown on the dashboard. - -For alarm templates: - -``` -on: CONTEXT -``` - -`CONTEXT` is the template of a chart. For example the charts `mysql_local.net` and -`mysql_server2.net` have the same context: `mysql.net`. So, you can use this to apply -alarms to all `mysql.net` charts. - -To find the `CONTEXT` of a chart hover over its date, above the legend. A tooltip will -appear with this format `plugin:nodule, context`. For example, the bandwidth chart of -a network interface says: - -``` -proc:/proc/dev/dev, net.net -``` - -So, `plugin = proc`, `module = /proc/net/dev` and `context = net.net`. - ---- - -#### Alarm line `os` - -This alarm or template will be used only if the O/S of the host loading it, matches this -pattern list. The value is a space separated list of simple patterns (use `*` as wildcard, -prefix with `!` for a negative match, order is important). - -``` -os: linux freebsd macos -``` - ---- - -#### Alarm line `hosts` - -This alarm or template will be used only if the hostname of the host loading it, matches -this pattern list. The value is a space separated list of simple patterns (use `*` as wildcard, -prefix with `!` for a negative match, order is important). - -``` -hosts: server1 server2 database* !redis3 redis* -``` - -The above says: use this alarm on all hosts named `server1`, `server2`, `database*`, and -all `redis*` except `redis3`. - -This is useful when you centralize metrics from multiple hosts, to one Netdata. - ---- - -#### Alarm line `families` - -This line is only used in alarm templates. It filters the charts. So, if you need to create -an alarm template for a few of a kind of chart (a few of your disks, or a few of your network -interfaces, or a few your mysql servers, etc), you can create an alarm template that would -normally be applied to all of them, and filter them by [family](../web/README.md#families). - -The format is: - -``` -families: SIMPLE PATTERN LIST -``` - -The simple pattern syntax and operation is explained in [simple patterns](../libnetdata/simple_pattern/). - ---- - -#### Alarm line `lookup` - -This line makes a database lookup to find a value. This result of this lookup is available as `$this`. - -The format is: - -``` -lookup: METHOD AFTER [at BEFORE] [every DURATION] [OPTIONS] [of DIMENSIONS] [foreach DIMENSIONS] -``` - -Everything is the same with [badges](../web/api/badges/). In short: - -- `METHOD` is one of `average`, `min`, `max`, `sum`, `incremental-sum`. - This is required. - -- `AFTER` is a relative number of seconds, but it also accepts a single letter for changing - the units, like `-1s` = 1 second in the past, `-1m` = 1 minute in the past, `-1h` = 1 hour - in the past, `-1d` = 1 day in the past. You need a negative number (i.e. how far in the past - to look for the value). **This is required**. - -- `at BEFORE` is by default 0 and is not required. Using this you can define the end of the - lookup. So data will be evaluated between `AFTER` and `BEFORE`. - -- `every DURATION` sets the updated frequency of the lookup (supports single letter units as - above too). - -- `OPTIONS` is a space separated list of `percentage`, `absolute`, `min2max`, `unaligned`, - `match-ids`, `match-names`. Check the badges documentation for more info. - -- `of DIMENSIONS` is optional and has to be the last parameter. Dimensions have to be separated - by `,` or `|`. The space characters found in dimensions will be kept as-is (a few dimensions - have spaces in their names). This accepts Netdata simple patterns and the `match-ids` and - `match-names` options affect the searches for dimensions. - -- `foreach DIMENSIONS` is optional, will always be the last parameter, and uses the same `,`/`|` - rules as the `of` parameter. Each dimension you specify in `foreach` will use the same rule - to trigger an alarm. If you set both `of` and `foreach`, Netdata will ignore the `of` parameter - and replace it with one of the dimensions you gave to `foreach`. - -The result of the lookup will be available as `$this` and `$NAME` in expressions. -The timestamps of the timeframe evaluated by the database lookup is available as variables -`$after` and `$before` (both are unix timestamps). - ---- - -#### Alarm line `calc` - -This expression is evaluated just after the `lookup` (if any). Its purpose is to apply some -calculation before using the value looked up from the db. - -You can also have an expression without a lookup, using other variables that are available. - -The result of the calculation will be available as `$this` in warning and critical expressions -(overwriting the `lookup` one). - -Format: - -``` -calc: EXPRESSION -``` - -Check [Expressions](#expressions) for more information. - ---- - -#### Alarm line `every` - -Sets the update frequency of this alarm. This is the same to the `every DURATION` given -in the `lookup` lines. - -Format: - -``` -every: DURATION -``` - -`DURATION` accepts `s` for seconds, `m` is minutes, `h` for hours, `d` for days. - ---- - -#### Alarm lines `green` and `red` - -Set the green and red thresholds of a chart. Both are available as `$green` and `$red` in -expressions. If multiple alarms define different thresholds, the ones defined by the first -alarm will be used. These will eventually visualized on the dashboard, so only one set of -them is allowed. If you need multiple sets of them in different alarms, use absolute numbers -instead of `$red` and `$green`. - -Format: - -``` -green: NUMBER -red: NUMBER -``` - ---- - -#### Alarm lines `warn` and `crit` - -These expressions should evaluate to true or false (alternatively non-zero or zero). -They trigger the alarm. Both are optional. - -Format: - -``` -warn: EXPRESSION -crit: EXPRESSION -``` - -Check [Expressions](#expressions) for more information. - ---- - -#### Alarm line `to` - -This will be the first parameter of the script to be executed when the alarm switches status. -Its meaning is left up to the `exec` script. - -The default `exec` script, `alarm-notify.sh`, uses this field as a space separated list of roles, -which are then consulted to find the exact recipients per notification method. - -Format: - -``` -to: ROLE1 ROLE2 ROLE3 ... -``` - ---- - -#### Alarm line `exec` - -The script that will be executed when the alarm changes status. - -Format: - -``` -exec: SCRIPT -``` - -The default `SCRIPT` is Netdata's `alarm-notify.sh`, which supports all the notifications -methods Netdata supports, including custom hooks. - ---- - -#### Alarm line `delay` - -This is used to provide optional hysteresis settings for the notifications, to defend -against notification floods. These settings do not affect the actual alarm - only the time -the `exec` script is executed. - -Format: - -``` -delay: [[[up U] [down D] multiplier M] max X] -``` - -- `up U` defines the delay to be applied to a notification for an alarm that raised its status - (i.e. CLEAR to WARNING, CLEAR to CRITICAL, WARNING to CRITICAL). For example, `up 10s`, the - notification for this event will be sent 10 seconds after the actual event. This is used in - hope the alarm will get back to its previous state within the duration given. The default `U` - is zero. - -- `down D` defines the delay to be applied to a notification for an alarm that moves to lower - state (i.e. CRITICAL to WARNING, CRITICAL to CLEAR, WARNING to CLEAR). For example, `down 1m` - will delay the notification by 1 minute. This is used to prevent notifications for flapping - alarms. The default `D` is zero. - -- `mutliplier M` multiplies `U` and `D` when an alarm changes state, while a notification is - delayed. The default multiplier is `1.0`. - -- `max X` defines the maximum absolute notification delay an alarm may get. The default `X` - is `max(U * M, D * M)` (i.e. the max duration of `U` or `D` multiplied once with `M`). - - Example: - - `delay: up 10s down 15m multiplier 2 max 1h` - - The time is `00:00:00` and the status of the alarm is CLEAR. + - | time of event | new status | delay | notification will be sent|why| - |-------------|----------|:---:|-------------------------|---| - | 00:00:01 | WARNING | `up 10s` | 00:00:11|first state switch| - | 00:00:05 | CLEAR | `down 15m x2` | 00:30:05|the alarm changes state while a notification is delayed, so it was multiplied| - | 00:00:06 | WARNING | `up 10s x2 x2` | 00:00:26|multiplied twice| - | 00:00:07 | CLEAR | `down 15m x2 x2 x2` | 00:45:07|multiplied 3 times.| - - So: - - - `U` and `D` are multiplied by `M` every time the alarm changes state (any state, not just - their matching one) and a delay is in place. - - All are reset to their defaults when the alarm switches state without a delay in place. - ---- - -#### Alarm line `repeat` - -Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration, when one of these interval is bigger than 0, Netdata will activate the repeat notification for `CRITICAL`, `CLEAR` and `WARNING` messages. -` - -Format: - -``` -repeat: [off] [warning DURATION] [critical DURATION] -``` - -- `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings has been enabled in health configuration. -- `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode. -- `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode. - ---- - -#### Alarm line `option` - -The only possible value for the `option` line is - -``` -option: no-clear-notification -``` - -For some alarms we need compare two time-frames, to detect anomalies. For example, `health.d/httpcheck.conf` has an alarm template called `web_service_slow` that compares the average http call response time over the last 3 minutes, compared to the average over the last hour. It triggers a warning alarm when the average of the last 3 minutes is twice the average of the last hour. In such cases, it is easy to trigger the alarm, but difficult to tell when the alarm is cleared. As time passes, the newest window moves into the older, so the average response time of the last hour will keep increasing. Eventually, the comparison will find the averages in the two time-frames close enough to clear the alarm. However, the issue was not resolved, it's just a matter of the newer data "polluting" the old. For such alarms, it's a good idea to tell Netdata to not clear the notification, by using the `no-clear-notification` option. - ---- - -### Expressions - -Netdata has an internal [infix expression parser](../libnetdata/eval). -This parses expressions and creates an internal structure that allows fast execution of them. - -These operators are supported `+`, `-`, `*`, `/`, `<`, `<=`, `<>`, `!=`, `>`, `>=`, `&&`, `||`, -`!`, `AND`, `OR`, `NOT`. Boolean operators result in either `1` (true) or `0` (false). - -The conditional evaluation operator `?` is supported too. Using this operator IF-THEN-ELSE conditional statements can be specified. The format is: `(condition) ? (true expression) :(false expression)`. So, Netdata will first evaluate the `condition` and based on the result will either evaluate `true expression` or `false expression`. - -Example: `($this > 0) ? ($avail * 2) : ($used / 2)`. - -Nested such expressions are also supported (i.e. `true expression` and `false expression` can contain conditional evaluations). - -Expressions also support the `abs()` function. - -Expressions can have variables. Variables start with `$`. Check below for more information. - -There are two special values you can use: - -- `nan`, for example `$this != nan` will check if the variable `this` is available. A variable can be `nan` if the database lookup failed. All calculations (i.e. addition, multiplication, etc) with a `nan` result in a `nan`. - -- `inf`, for example `$this != inf` will check if `this` is not infinite. A value or variable can be infinite if divided by zero. All calculations (i.e. addition, multiplication, etc) with a `inf` result in a `inf`. - ---- - -### Special use of the conditional operator - -A common (but not necessarily obvious) use of the conditional evaluation operator is -to provide [hysteresis](https://en.wikipedia.org/wiki/Hysteresis) around the critical -or warning thresholds. This usage helps to avoid bogus messages resulting from small -variations in the value when it is varying regularly but staying close to the threshold -value, without needing to delay sending messages at all. - -An example of such usage from the default CPU usage alarms bundled with Netdata is: - -``` -warn: $this > (($status >= $WARNING) ? (75) : (85)) -crit: $this > (($status == $CRITICAL) ? (85) : (95)) -``` - -The above say: - -- If the alarm is currently a warning, then the threshold for being considered a warning - is 75, otherwise it's 85. - -- If the alarm is currently critical, then the threshold for being considered critical - is 85, otherwise it's 95. - -Which in turn, results in the following behavior: - -- While the value is rising, it will trigger a warning when it exceeds 85, and a critical - alert when it exceeds 95. - -- While the value is falling, it will return to a warning state when it goes below 85, - and a normal state when it goes below 75. - -- If the value is constantly varying between 80 and 90, then it will trigger a warning the - first time it goes above 85, but will remain a warning until it goes below 75 (or goes above 85). - -- If the value is constantly varying between 90 and 100, then it will trigger a critical alert - the first time it goes above 95, but will remain a critical alert goes below 85 (at which - point it will return to being a warning). - ---- - -### Variables - -You can find all the variables that can be used for a given chart, using -`http://your.netdata.ip:19999/api/v1/alarm_variables?chart=CHART_NAME` -Example: [variables for the `system.cpu` chart of the registry](https://registry.my-netdata.io/api/v1/alarm_variables?chart=system.cpu). - -_Hint: If you don't know how to find the CHART_NAME, you can read about it [here](../web/README.md#charts)._ - -Netdata supports 3 internal indexes for variables that will be used in health monitoring. - -
The variables below can be used in both chart alarms and context templates. - -Although the `alarm_variables` link shows you variables for a particular chart, the same variables can also be used in templates for charts belonging to a given [context](../web/README.md#contexts). The reason is that all charts of a given context are essentially identical, with the only difference being the [family](../web/README.md#families) that identifies a particular hardware or software instance. Charts and templates do not apply to specific families anyway, unless if you explicitly limit an alarm with the [alarm line `families`](#alarm-line-families). -
- -- **chart local variables**. All the dimensions of the chart are exposed as local variables. The value of $this for the other configured alarms of the chart also appears, under the name of each configured alarm. - - Charts also define a few special variables: - - - `$last_collected_t` is the unix timestamp of the last data collection - - `$collected_total_raw` is the sum of all the dimensions (their last collected values) - - `$update_every` is the update frequency of the chart - - `$green` and `$red` the threshold defined in alarms (these are per chart - the charts - inherits them from the the first alarm that defined them) - - Chart dimensions define their last calculated (i.e. interpolated) value, exactly as - shown on the charts, but also a variable with their name and suffix `_raw` that resolves - to the last collected value - as collected and another with suffix `_last_collected_t` - that resolves to unix timestamp the dimension was last collected (there may be dimensions - that fail to be collected while others continue normally). - -- **family variables**. Families are used to group charts together. For example all `eth0` - charts, have `family = eth0`. This index includes all local variables, but if there are - overlapping variables, only the first are exposed. - -- **host variables**. All the dimensions of all charts, including all alarms, in fullname. - Fullname is `CHART.VARIABLE`, where `CHART` is either the chart id or the chart name (both - are supported). - -- **special variables\*** are: - - - `$this`, which is resolved to the value of the current alarm. - - - `$status`, which is resolved to the current status of the alarm (the current = the last - status, i.e. before the current database lookup and the evaluation of the `calc` line). - This values can be compared with `$REMOVED`, `$UNINITIALIZED`, `$UNDEFINED`, `$CLEAR`, - `$WARNING`, `$CRITICAL`. These values are incremental, ie. `$status > $CLEAR` works as - expected. - - - `$now`, which is resolved to current unix timestamp. - -## Alarm Statuses - -Alarms can have the following statuses: - -- `REMOVED` - the alarm has been deleted (this happens when a SIGUSR2 is sent to Netdata - to reload health configuration) - -- `UNINITIALIZED` - the alarm is not initialized yet - -- `UNDEFINED` - the alarm failed to be calculated (i.e. the database lookup failed, - a division by zero occurred, etc) - -- `CLEAR` - the alarm is not armed / raised (i.e. is OK) - -- `WARNING` - the warning expression resulted in true or non-zero - -- `CRITICAL` - the critical expression resulted in true or non-zero - -The external script will be called for all status changes. - -## Examples - -Check the `health/health.d/` directory for all alarms shipped with Netdata. - -Here are a few examples: - -### Example 1 - -A simple check if an apache server is alive: - -``` -template: apache_last_collected_secs - on: apache.requests - calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) -``` - -The above checks that Netdata is able to collect data from apache. In detail: - -``` -template: apache_last_collected_secs -``` - -The above defines a **template** named `apache_last_collected_secs`. -The name is important since `$apache_last_collected_secs` resolves to the `calc` line. -So, try to give something descriptive. - -``` - on: apache.requests -``` - -The above applies the **template** to all charts that have `context = apache.requests` -(i.e. all your apache servers). - -``` - calc: $now - $last_collected_t -``` - -- `$now` is a standard variable that resolves to the current timestamp. - -- `$last_collected_t` is the last data collection timestamp of the chart. - So this calculation gives the number of seconds passed since the last data collection. - -``` - every: 10s -``` - -The alarm will be evaluated every 10 seconds. - -``` - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) -``` - -If these result in non-zero or true, they trigger the alarm. - -- `$this` refers to the value of this alarm (i.e. the result of the `calc` line. - We could also use `$apache_last_collected_secs`. - -`$update_every` is the update frequency of the chart, in seconds. - -So, the warning condition checks if we have not collected data from apache for 5 -iterations and the critical condition checks for 10 iterations. - -### Example 2 - -Check if any of the disks is critically low on disk space: - -``` -template: disk_full_percent - on: disk.space - calc: $used * 100 / ($avail + $used) - every: 1m - warn: $this > 80 - crit: $this > 95 - repeat: warning 120s critical 10s -``` - -`$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard. - -So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage. - -This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode. - -### Example 3 - -Predict if any disk will run out of space in the near future. - -We do this in 2 steps: - -Calculate the disk fill rate: - -``` - template: disk_fill_rate - on: disk.space - lookup: max -1s at -30m unaligned of avail - calc: ($this - $avail) / (30 * 60) - every: 15s -``` - -In the `calc` line: `$this` is the result of the `lookup` line (i.e. the free space 30 minutes -ago) and `$avail` is the current disk free space. So the `calc` line will either have a positive -number of GB/second if the disk if filling up, or a negative number of GB/second if the disk is -freeing up space. - -There is no `warn` or `crit` lines here. So, this template will just do the calculation and -nothing more. - -Predict the hours after which the disk will run out of space: - -``` - template: disk_full_after_hours - on: disk.space - calc: $avail / $disk_fill_rate / 3600 - every: 10s - warn: $this > 0 and $this < 48 - crit: $this > 0 and $this < 24 -``` - -The `calc` line estimates the time in hours, we will run out of disk space. Of course, only -positive values are interesting for this check, so the warning and critical conditions check -for positive values and that we have enough free space for 48 and 24 hours respectively. - -Once this alarm triggers we will receive an email like this: - -![image](https://cloud.githubusercontent.com/assets/2662304/17839993/87872b32-6802-11e6-8e08-b2e4afef93bb.png) - -### Example 4 - -Check if any network interface is dropping packets: - -``` -template: 30min_packet_drops - on: net.drops - lookup: sum -30m unaligned absolute - every: 10s - crit: $this > 0 -``` - -The `lookup` line will calculate the sum of the all dropped packets in the last 30 minutes. - -The `crit` line will issue a critical alarm if even a single packet has been dropped. - -Note that the drops chart does not exist if a network interface has never dropped a single packet. -When Netdata detects a dropped packet, it will add the chart and it will automatically attach this -alarm to it. - -### Example 5 - -Check if user or system dimension is using more than 50% of cpu: - -``` - alarm: dim_template - on: system.cpu - os: linux -lookup: average -3s percentage foreach system,user - units: % - every: 10s - warn: $this > 50 - crit: $this > 80 -``` - -The `lookup` line will calculate the average CPU usage from system and user in the last 3 seconds. Because we have -the foreach in the `lookup` line, Netdata will create two independent alarms called `dim_template_system` -and `dim_template_user` that will have all the other parameters shared among them. - -### Example 6 +# Health monitoring -Check if all dimensions are using more than 50% of cpu: +The Netdata Agent is a health watchdog for the health and performance of your systems, services, and applications. We've +worked closely with our community of DevOps engineers, SREs, and developers to define hundreds of production-ready +alarms that work without any configuration. -``` - alarm: dim_template - on: system.cpu - os: linux -lookup: average -3s percentage foreach * - units: % - every: 10s - warn: $this > 50 - crit: $this > 80 -``` +The Agent's health monitoring system is also dynamic and fully customizable. You can write entirely new alarms, tune the +community-configured alarms for every app/service [the Agent collects metrics from](/collectors/COLLECTORS.md), or +silence anything you're not interested in. You can even power complex lookups by running statistical algorithms against +your metrics. -The `lookup` line will calculate the average of CPU usage from system and user in the last 3 seconds. In this case -Netdata will create alarms for all dimensions of the chart. +Ready to take the next steps with health monitoring? -## Troubleshooting +[Quickstart](/health/QUICKSTART.md) -You can compile Netdata with [debugging](../daemon#debugging) and then set in `netdata.conf`: +[Configuration reference](/health/REFERENCE.md) -``` -[global] - debug flags = 0x0000000000800000 -``` +## Guides -Then check your `/var/log/netdata/debug.log`. It will show you how it works. -Important: this will generate a lot of output in debug.log. +Every infrastructure is different, so we're not interested in mandating how you should configure Netdata's health +monitoring features. Instead, these guides should give you the details you need to tweak alarms to your heart's +content. -You can find the context of charts by looking up the chart in either -`http://your.netdata:19999/netdata.conf` or `http://your.netdata:19999/api/v1/charts`. +[Stopping notifications for individual alarms](/docs/guides/monitor/stop-notifications-alarms.md) -You can find how Netdata interpreted the expressions by examining the alarm at `http://your.netdata:19999/api/v1/alarms?all`. For each expression, Netdata will return the expression as given in its config file, and the same expression with additional parentheses added to indicate the evaluation flow of the expression. +[Use dimension templates to create dynamic alarms](/docs/guides/monitor/dimension-templates.md) -## Disabling health checks or silencing notifications at runtime +## Related features -It's currently not possible to schedule notifications from within the alarm template. For those scenarios where you need to temporary disable notifications (for instance when running backups triggers a disk alert) you can disable or silence notifications are runtime. The health checks can be controlled at runtime via the [health management api](../web/api/health/#health-management-api). +**[Notifications](/health/notifications/README.md)**: Get notified about ongoing alarms from your Agents via your +favorite platform(s), such as Slack, Discord, PagerDuty, email, and much more. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/REFERENCE.md b/health/REFERENCE.md new file mode 100644 index 000000000..bc5f40ccd --- /dev/null +++ b/health/REFERENCE.md @@ -0,0 +1,797 @@ + + +# Health configuration reference + +Welcome to the health configuration reference. + +This guide contains information about editing health configuration files to tweak existing alarms or create new health +entities that are customized to the needs of your infrastructure. + +To learn the basics of locating and editing health configuration files, see the [health +quickstart](/health/QUICKSTART.md). + +## Health configuration files + +You can configure the Agent's health watchdog service by editing files in two locations: + +- The `[health]` section in `netdata.conf`. By editing the daemon's behavior, you can disable health monitoring + altogether, run health checks more or less often, and more. See [daemon + configuration](/daemon/config/README.md#health-section-options) for a table of all the available settings, their + default values, and what they control. +- The individual `.conf` files in `health.d/`. These health entity files are organized by the type of metric they are + performing calculations on or their associated collector. You should edit these files using the `edit-config` + script. For example: `sudo ./edit-config health.d/cpu.conf`. + +## Health entity reference + +The following reference contains information about the syntax and options of _health entities_, which Netdata attaches +to charts in order to trigger alarms. + +### Entity types + +There are two entity types: **alarms** and **templates**. They have the same format and feature set—the only difference +is their label. + +**Alarms** are attached to specific charts and use the `alarm` label. + +**Templates** define rules that apply to all charts of a specific context, and use the `template` label. Templates help +you apply one entity to all disks, all network interfaces, all MySQL databases, and so on. + +Alarms have higher precedence and will override templates. If an alarm and template entity have the same name and attach +to the same chart, Netdata will use the alarm. + +### Entity format + +Netdata parses the following lines. Beneath the table is an in-depth explanation of each line's purpose and syntax. + +- The `on` and `lookup` lines are **always required**. +- Each entity **must** have one of the following lines: `calc`, `warn`, or `crit`. +- The `alarm` or `template` line must be the first line of any entity. +- A few lines use space-separated lists to define how the entity behaves. You can use `*` as a wildcard or prefix with + `!` for a negative match. Order is important, too! See our [simple patterns docs](../libnetdata/simple_pattern/) for + more examples. + +| line | required | functionality | +| --------------------------------------------------- | --------------- | ------------------------------------------------------------------------------------- | +| [`alarm`/`template`](#alarm-line-alarm-or-template) | yes | Name of the alarm/template. | +| [`on`](#alarm-line-on) | yes | The chart this alarm should attach to. | +| [`os`](#alarm-line-os) | no | Which operating systems to run this chart. | +| [`hosts`](#alarm-line-hosts) | no | Which hostnames will run this alarm. | +| [`plugin`](#alarm-line-plugin) | no | Restrict an alarm or template to only a certain plugin. | +| [`module`](#alarm-line-module) | no | Restrict an alarm or template to only a certain module. | +| [`families`](#alarm-line-families) | no | Restrict a template to only certain families. | +| [`lookup`](#alarm-line-lookup) | yes | The database lookup to find and process metrics for the chart specified through `on`. | +| [`calc`](#alarm-line-calc) | yes (see above) | A calculation to apply to the value found via `lookup` or another variable. | +| [`every`](#alarm-line-every) | no | The frequency of the alarm. | +| [`green`/`red`](#alarm-lines-green-and-red) | no | Set the green and red thresholds of a chart. | +| [`warn`/`crit`](#alarm-lines-warn-and-crit) | yes (see above) | Expressions evaluating to true or false, and when true, will trigger the alarm. | +| [`to`](#alarm-line-to) | no | A list of roles to send notifications to. | +| [`exec`](#alarm-line-exec) | no | The script to execute when the alarm changes status. | +| [`delay`](#alarm-line-delay) | no | Optional hysteresis settings to prevent floods of notifications. | +| [`repeat`](#alarm-line-repeat) | no | The interval for sending notifications when an alarm is in WARNING or CRITICAL mode. | +| [`option`](#alarm-line-option) | no | Add an option to not clear alarms. | +| [`host labels`](#alarm-line-host-labels) | no | List of labels present on a host. | + +The `alarm` or `template` line must be the first line of any entity. + +#### Alarm line `alarm` or `template` + +This line starts an alarm or template based on the [entity type](#entity-types) you're interested in creating. + +**Alarm:** + +```yaml +alarm: NAME +``` + +**Template:** + +```yaml +template: NAME +``` + +`NAME` can be any alpha character, with `.` (period) and `_` (underscore) as the only allowed symbols, but the names +cannot be `chart name`, `dimension name`, `family name`, or `chart variables names`. + +#### Alarm line `on` + +This line defines the chart this alarm should attach to. + +**Alarms:** + +```yaml +on: CHART +``` + +The value `CHART` should be the unique ID or name of the chart you're interested in, as shown on the dashboard. In the +image below, the unique ID is `system.cpu`. + +![Finding the unique ID of a +chart](https://user-images.githubusercontent.com/1153921/67443082-43b16e80-f5b8-11e9-8d33-d6ee052c6678.png) + +**Template:** + +```yaml +on: CONTEXT +``` + +The value `CONTEXT` should be the context you want this template to attach to. + +Need to find the context? Hover over the date on any given chart and look at the tooltip. In the image below, which +shows a disk I/O chart, the tooltip reads: `proc:/proc/diskstats, disk.io`. + +![Finding the context of a chart via the tooltip](https://user-images.githubusercontent.com/1153921/68882856-2b230880-06cd-11ea-923b-b28c4632d479.png) + +You're interested in what comes after the comma: `disk.io`. That's the name of the chart's context. + +If you create a template using the `disk.io` context, it will apply an alarm to every disk available on your system. + +#### Alarm line `os` + +The alarm or template will be used only if the operating system of the host matches this list specified in `os`. The +value is a space-separated list. + +The following example enables the entity on Linux, FreeBSD, and macOS, but no other operating systems. + +```yaml +os: linux freebsd macos +``` + +#### Alarm line `hosts` + +The alarm or template will be used only if the hostname of the host matches this space-separated list. + +The following example will load on systems with the hostnames `server` and `server2`, and any system with hostnames that +begin with `database`. It _will not load_ on the host `redis3`, but will load on any _other_ systems with hostnames that +begin with `redis`. + +```yaml +hosts: server1 server2 database* !redis3 redis* +``` + +#### Alarm line `plugin` + +The `plugin` line filters which plugin within the context this alarm should apply to. The value is a space-separated +list of [simple patterns](/libnetdata/simple_pattern/README.md). For example, +you can create a filter for an alarm that applies specifically to `python.d.plugin`: + +```yaml +plugin: python.d.plugin +``` + +The `plugin` line is best used with other options like `module`. When used alone, the `plugin` line creates a very +inclusive filter that is unlikely to be of much use in production. See [`module`](#alarm-line-module) for a +comprehensive example using both. + +#### Alarm line `module` + +The `module` line filters which module within the context this alarm should apply to. The value is a space-separated +list of [simple patterns](/libnetdata/simple_pattern/README.md). For +example, you can create an alarm that applies only on the `isc_dhcpd` module started by `python.d.plugin`: + +```yaml +plugin: python.d.plugin +module: isc_dhcpd +``` + +#### Alarm line `families` + +The `families` line, used only alongside templates, filters which families within the context this alarm should apply +to. The value is a space-separated list. + +The value is a space-separate list of simple patterns. See our [simple patterns docs](../libnetdata/simple_pattern/) for +some examples. + +For example, you can create a template on the `disk.io` context, but filter it to only the `sda` and `sdb` families: + +```yaml +families: sda sdb +``` + +#### Alarm line `lookup` + +This line makes a database lookup to find a value. This result of this lookup is available as `$this`. + +The format is: + +```yaml +lookup: METHOD AFTER [at BEFORE] [every DURATION] [OPTIONS] [of DIMENSIONS] [foreach DIMENSIONS] +``` + +Everything is the same with [badges](../web/api/badges/). In short: + +- `METHOD` is one of `average`, `min`, `max`, `sum`, `incremental-sum`. + This is required. + +- `AFTER` is a relative number of seconds, but it also accepts a single letter for changing + the units, like `-1s` = 1 second in the past, `-1m` = 1 minute in the past, `-1h` = 1 hour + in the past, `-1d` = 1 day in the past. You need a negative number (i.e. how far in the past + to look for the value). **This is required**. + +- `at BEFORE` is by default 0 and is not required. Using this you can define the end of the + lookup. So data will be evaluated between `AFTER` and `BEFORE`. + +- `every DURATION` sets the updated frequency of the lookup (supports single letter units as + above too). + +- `OPTIONS` is a space separated list of `percentage`, `absolute`, `min2max`, `unaligned`, + `match-ids`, `match-names`. Check the badges documentation for more info. + +- `of DIMENSIONS` is optional and has to be the last parameter. Dimensions have to be separated + by `,` or `|`. The space characters found in dimensions will be kept as-is (a few dimensions + have spaces in their names). This accepts Netdata simple patterns _(with `words` separated by + `,` or `|` instead of spaces)_ and the `match-ids` and `match-names` options affect the searches + for dimensions. + +- `foreach DIMENSIONS` is optional, will always be the last parameter, and uses the same `,`/`|` + rules as the `of` parameter. Each dimension you specify in `foreach` will use the same rule + to trigger an alarm. If you set both `of` and `foreach`, Netdata will ignore the `of` parameter + and replace it with one of the dimensions you gave to `foreach`. + +The result of the lookup will be available as `$this` and `$NAME` in expressions. +The timestamps of the timeframe evaluated by the database lookup is available as variables +`$after` and `$before` (both are unix timestamps). + +#### Alarm line `calc` + +A `calc` is designed to apply some calculation to the values or variables available to the entity. The result of the +calculation will be made available at the `$this` variable, overwriting the value from your `lookup`, to use in warning +and critical expressions. + +When paired with `lookup`, `calc` will perform the calculation just after `lookup` has retrieved a value from Netdata's +database. + +You can use `calc` without `lookup` if you are using [other available variables](#variables). + +The `calc` line uses [expressions](#expressions) for its syntax. + +```yaml +calc: EXPRESSION +``` + +#### Alarm line `every` + +Sets the update frequency of this alarm. This is the same to the `every DURATION` given +in the `lookup` lines. + +Format: + +```yaml +every: DURATION +``` + +`DURATION` accepts `s` for seconds, `m` is minutes, `h` for hours, `d` for days. + +#### Alarm lines `green` and `red` + +Set the green and red thresholds of a chart. Both are available as `$green` and `$red` in expressions. If multiple +alarms define different thresholds, the ones defined by the first alarm will be used. These will eventually visualized +on the dashboard, so only one set of them is allowed. If you need multiple sets of them in different alarms, use +absolute numbers instead of `$red` and `$green`. + +Format: + +```yaml +green: NUMBER +red: NUMBER +``` + +#### Alarm lines `warn` and `crit` + +Define the expression that triggers either a warning or critical alarm. These are optional, and should evaluate to +either true or false (or zero/non-zero). + +The format uses Netdata's [expressions syntax](#expressions). + +```yaml +warn: EXPRESSION +crit: EXPRESSION +``` + +#### Alarm line `to` + +This will be the first parameter of the script to be executed when the alarm switches status. Its meaning is left up to +the `exec` script. + +The default `exec` script, `alarm-notify.sh`, uses this field as a space separated list of roles, which are then +consulted to find the exact recipients per notification method. + +Format: + +```yaml +to: ROLE1 ROLE2 ROLE3 ... +``` + +#### Alarm line `exec` + +The script that will be executed when the alarm changes status. + +Format: + +```yaml +exec: SCRIPT +``` + +The default `SCRIPT` is Netdata's `alarm-notify.sh`, which supports all the notifications methods Netdata supports, +including custom hooks. + +#### Alarm line `delay` + +This is used to provide optional hysteresis settings for the notifications, to defend against notification floods. These +settings do not affect the actual alarm - only the time the `exec` script is executed. + +Format: + +```yaml +delay: [[[up U] [down D] multiplier M] max X] +``` + +- `up U` defines the delay to be applied to a notification for an alarm that raised its status + (i.e. CLEAR to WARNING, CLEAR to CRITICAL, WARNING to CRITICAL). For example, `up 10s`, the + notification for this event will be sent 10 seconds after the actual event. This is used in + hope the alarm will get back to its previous state within the duration given. The default `U` + is zero. + +- `down D` defines the delay to be applied to a notification for an alarm that moves to lower + state (i.e. CRITICAL to WARNING, CRITICAL to CLEAR, WARNING to CLEAR). For example, `down 1m` + will delay the notification by 1 minute. This is used to prevent notifications for flapping + alarms. The default `D` is zero. + +- `multiplier M` multiplies `U` and `D` when an alarm changes state, while a notification is + delayed. The default multiplier is `1.0`. + +- `max X` defines the maximum absolute notification delay an alarm may get. The default `X` + is `max(U * M, D * M)` (i.e. the max duration of `U` or `D` multiplied once with `M`). + + Example: + + `delay: up 10s down 15m multiplier 2 max 1h` + + The time is `00:00:00` and the status of the alarm is CLEAR. + + | time of event | new status | delay | notification will be sent | why | + | ------------- | ---------- | --- | ------------------------- | --- | + | 00:00:01 | WARNING | `up 10s` | 00:00:11 | first state switch | + | 00:00:05 | CLEAR | `down 15m x2` | 00:30:05 | the alarm changes state while a notification is delayed, so it was multiplied | + | 00:00:06 | WARNING | `up 10s x2 x2` | 00:00:26 | multiplied twice | + | 00:00:07 | CLEAR | `down 15m x2 x2 x2` | 00:45:07 | multiplied 3 times. | + + So: + + - `U` and `D` are multiplied by `M` every time the alarm changes state (any state, not just + their matching one) and a delay is in place. + - All are reset to their defaults when the alarm switches state without a delay in place. + +#### Alarm line `repeat` + +Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the +default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating +notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in +health stock configuration, when one of these interval is bigger than 0, Netdata will activate the repeat notification +for `CRITICAL`, `CLEAR` and `WARNING` messages. + +Format: + +```yaml +repeat: [off] [warning DURATION] [critical DURATION] +``` + +- `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings has + been enabled in health configuration. +- `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating + notification for WARNING mode. +- `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating + notification for CRITICAL mode. + +#### Alarm line `option` + +The only possible value for the `option` line is + +```yaml +option: no-clear-notification +``` + +For some alarms we need compare two time-frames, to detect anomalies. For example, `health.d/httpcheck.conf` has an +alarm template called `web_service_slow` that compares the average http call response time over the last 3 minutes, +compared to the average over the last hour. It triggers a warning alarm when the average of the last 3 minutes is twice +the average of the last hour. In such cases, it is easy to trigger the alarm, but difficult to tell when the alarm is +cleared. As time passes, the newest window moves into the older, so the average response time of the last hour will keep +increasing. Eventually, the comparison will find the averages in the two time-frames close enough to clear the alarm. +However, the issue was not resolved, it's just a matter of the newer data "polluting" the old. For such alarms, it's a +good idea to tell Netdata to not clear the notification, by using the `no-clear-notification` option. + +#### Alarm line `host labels` + +Defines the list of labels present on a host. See our [host labels guide](/docs/guides/using-host-labels.md) for +an explanation of host labels and how to implement them. + +For example, let's suppose that `netdata.conf` is configured with the following labels: + +```yaml +[host labels] + installed = 20191211 + room = server +``` + +And more labels in `netdata.conf` for workstations: + +```yaml +[host labels] + installed = 201705 + room = workstation +``` + +By defining labels inside of `netdata.conf`, you can now apply labels to alarms. For example, you can add the following +line to any alarms you'd like to apply to hosts that have the label `room = server`. + +```yaml +host labels: room = server +``` + +The `host labels` is a space-separated list that accepts simple patterns. For example, you can create an alarm +that will be applied to all hosts installed in the last decade with the following line: + +```yaml +host labels: installed = 201* +``` + +See our [simple patterns docs](../libnetdata/simple_pattern/) for more examples. + +## Expressions + +Netdata has an internal [infix expression parser](../libnetdata/eval). This parses expressions and creates an internal +structure that allows fast execution of them. + +These operators are supported `+`, `-`, `*`, `/`, `<`, `<=`, `<>`, `!=`, `>`, `>=`, `&&`, `||`, `!`, `AND`, `OR`, `NOT`. +Boolean operators result in either `1` (true) or `0` (false). + +The conditional evaluation operator `?` is supported too. Using this operator IF-THEN-ELSE conditional statements can be +specified. The format is: `(condition) ? (true expression) : (false expression)`. So, Netdata will first evaluate the +`condition` and based on the result will either evaluate `true expression` or `false expression`. + +Example: `($this > 0) ? ($avail * 2) : ($used / 2)`. + +Nested such expressions are also supported (i.e. `true expression` and `false expression` can contain conditional +evaluations). + +Expressions also support the `abs()` function. + +Expressions can have variables. Variables start with `$`. Check below for more information. + +There are two special values you can use: + +- `nan`, for example `$this != nan` will check if the variable `this` is available. A variable can be `nan` if the + database lookup failed. All calculations (i.e. addition, multiplication, etc) with a `nan` result in a `nan`. + +- `inf`, for example `$this != inf` will check if `this` is not infinite. A value or variable can be set to infinite + if divided by zero. All calculations (i.e. addition, multiplication, etc) with a `inf` result in a `inf`. + +### Special use of the conditional operator + +A common (but not necessarily obvious) use of the conditional evaluation operator is to provide +[hysteresis](https://en.wikipedia.org/wiki/Hysteresis) around the critical or warning thresholds. This usage helps to +avoid bogus messages resulting from small variations in the value when it is varying regularly but staying close to the +threshold value, without needing to delay sending messages at all. + +An example of such usage from the default CPU usage alarms bundled with Netdata is: + +```yaml +warn: $this > (($status >= $WARNING) ? (75) : (85)) +crit: $this > (($status == $CRITICAL) ? (85) : (95)) +``` + +The above say: + +- If the alarm is currently a warning, then the threshold for being considered a warning is 75, otherwise it's 85. + +- If the alarm is currently critical, then the threshold for being considered critical is 85, otherwise it's 95. + +Which in turn, results in the following behavior: + +- While the value is rising, it will trigger a warning when it exceeds 85, and a critical alert when it exceeds 95. + +- While the value is falling, it will return to a warning state when it goes below 85, and a normal state when it goes + below 75. + +- If the value is constantly varying between 80 and 90, then it will trigger a warning the first time it goes above + 85, but will remain a warning until it goes below 75 (or goes above 85). + +- If the value is constantly varying between 90 and 100, then it will trigger a critical alert the first time it goes + above 95, but will remain a critical alert goes below 85 (at which point it will return to being a warning). + +## Variables + +You can find all the variables that can be used for a given chart, using +`http://NODE:19999/api/v1/alarm_variables?chart=CHART_NAME`, replacing `NODE` with the IP address or hostname for your +Agent dashboard. For example, [variables for the `system.cpu` chart of the +registry](https://registry.my-netdata.io/api/v1/alarm_variables?chart=system.cpu). + +> If you don't know how to find the CHART_NAME, you can read about it [here](../web/README.md#charts). + +Netdata supports 3 internal indexes for variables that will be used in health monitoring. + +
The variables below can be used in both chart alarms and context templates. + +Although the `alarm_variables` link shows you variables for a particular chart, the same variables can also be used in +templates for charts belonging to a given [context](../web/README.md#contexts). The reason is that all charts of a given +context are essentially identical, with the only difference being the [family](../web/README.md#families) that +identifies a particular hardware or software instance. Charts and templates do not apply to specific families anyway, +unless if you explicitly limit an alarm with the [alarm line `families`](#alarm-line-families). + +
+ +- **chart local variables**. All the dimensions of the chart are exposed as local variables. The value of `$this` for + the other configured alarms of the chart also appears, under the name of each configured alarm. + + Charts also define a few special variables: + + - `$last_collected_t` is the unix timestamp of the last data collection + - `$collected_total_raw` is the sum of all the dimensions (their last collected values) + - `$update_every` is the update frequency of the chart + - `$green` and `$red` the threshold defined in alarms (these are per chart - the charts + inherits them from the the first alarm that defined them) + + Chart dimensions define their last calculated (i.e. interpolated) value, exactly as + shown on the charts, but also a variable with their name and suffix `_raw` that resolves + to the last collected value - as collected and another with suffix `_last_collected_t` + that resolves to unix timestamp the dimension was last collected (there may be dimensions + that fail to be collected while others continue normally). + +- **family variables**. Families are used to group charts together. For example all `eth0` + charts, have `family = eth0`. This index includes all local variables, but if there are + overlapping variables, only the first are exposed. + +- **host variables**. All the dimensions of all charts, including all alarms, in fullname. + Fullname is `CHART.VARIABLE`, where `CHART` is either the chart id or the chart name (both + are supported). + +- **special variables\*** are: + + - `$this`, which is resolved to the value of the current alarm. + + - `$status`, which is resolved to the current status of the alarm (the current = the last + status, i.e. before the current database lookup and the evaluation of the `calc` line). + This values can be compared with `$REMOVED`, `$UNINITIALIZED`, `$UNDEFINED`, `$CLEAR`, + `$WARNING`, `$CRITICAL`. These values are incremental, ie. `$status > $CLEAR` works as + expected. + + - `$now`, which is resolved to current unix timestamp. + +## Alarm statuses + +Alarms can have the following statuses: + +- `REMOVED` - the alarm has been deleted (this happens when a SIGUSR2 is sent to Netdata + to reload health configuration) + +- `UNINITIALIZED` - the alarm is not initialized yet + +- `UNDEFINED` - the alarm failed to be calculated (i.e. the database lookup failed, + a division by zero occurred, etc) + +- `CLEAR` - the alarm is not armed / raised (i.e. is OK) + +- `WARNING` - the warning expression resulted in true or non-zero + +- `CRITICAL` - the critical expression resulted in true or non-zero + +The external script will be called for all status changes. + +## Example alarms + +Check the `health/health.d/` directory for all alarms shipped with Netdata. + +Here are a few examples: + +### Example 1 + +A simple check if an apache server is alive: + +```yaml +template: apache_last_collected_secs + on: apache.requests + calc: $now - $last_collected_t + every: 10s + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) +``` + +The above checks that Netdata is able to collect data from apache. In detail: + +```yaml +template: apache_last_collected_secs +``` + +The above defines a **template** named `apache_last_collected_secs`. +The name is important since `$apache_last_collected_secs` resolves to the `calc` line. +So, try to give something descriptive. + +```yaml + on: apache.requests +``` + +The above applies the **template** to all charts that have `context = apache.requests` +(i.e. all your apache servers). + +```yaml + calc: $now - $last_collected_t +``` + +- `$now` is a standard variable that resolves to the current timestamp. + +- `$last_collected_t` is the last data collection timestamp of the chart. + So this calculation gives the number of seconds passed since the last data collection. + +```yaml + every: 10s +``` + +The alarm will be evaluated every 10 seconds. + +```yaml + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) +``` + +If these result in non-zero or true, they trigger the alarm. + +- `$this` refers to the value of this alarm (i.e. the result of the `calc` line. + We could also use `$apache_last_collected_secs`. + +`$update_every` is the update frequency of the chart, in seconds. + +So, the warning condition checks if we have not collected data from apache for 5 +iterations and the critical condition checks for 10 iterations. + +### Example 2 + +Check if any of the disks is critically low on disk space: + +```yaml +template: disk_full_percent + on: disk.space + calc: $used * 100 / ($avail + $used) + every: 1m + warn: $this > 80 + crit: $this > 95 + repeat: warning 120s critical 10s +``` + +`$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard. + +So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage. + +This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also +repeats notifications every 2 minutes if the alarm goes into WARNING mode. + +### Example 3 + +Predict if any disk will run out of space in the near future. + +We do this in 2 steps: + +Calculate the disk fill rate: + +```yaml + template: disk_fill_rate + on: disk.space + lookup: max -1s at -30m unaligned of avail + calc: ($this - $avail) / (30 * 60) + every: 15s +``` + +In the `calc` line: `$this` is the result of the `lookup` line (i.e. the free space 30 minutes +ago) and `$avail` is the current disk free space. So the `calc` line will either have a positive +number of GB/second if the disk if filling up, or a negative number of GB/second if the disk is +freeing up space. + +There is no `warn` or `crit` lines here. So, this template will just do the calculation and +nothing more. + +Predict the hours after which the disk will run out of space: + +```yaml + template: disk_full_after_hours + on: disk.space + calc: $avail / $disk_fill_rate / 3600 + every: 10s + warn: $this > 0 and $this < 48 + crit: $this > 0 and $this < 24 +``` + +The `calc` line estimates the time in hours, we will run out of disk space. Of course, only +positive values are interesting for this check, so the warning and critical conditions check +for positive values and that we have enough free space for 48 and 24 hours respectively. + +Once this alarm triggers we will receive an email like this: + +![image](https://cloud.githubusercontent.com/assets/2662304/17839993/87872b32-6802-11e6-8e08-b2e4afef93bb.png) + +### Example 4 + +Check if any network interface is dropping packets: + +```yaml +template: 30min_packet_drops + on: net.drops + lookup: sum -30m unaligned absolute + every: 10s + crit: $this > 0 +``` + +The `lookup` line will calculate the sum of the all dropped packets in the last 30 minutes. + +The `crit` line will issue a critical alarm if even a single packet has been dropped. + +Note that the drops chart does not exist if a network interface has never dropped a single packet. +When Netdata detects a dropped packet, it will add the chart and it will automatically attach this +alarm to it. + +### Example 5 + +Check if user or system dimension is using more than 50% of cpu: + +```yaml + alarm: dim_template + on: system.cpu + os: linux +lookup: average -3s percentage foreach system,user + units: % + every: 10s + warn: $this > 50 + crit: $this > 80 +``` + +The `lookup` line will calculate the average CPU usage from system and user in the last 3 seconds. Because we have +the foreach in the `lookup` line, Netdata will create two independent alarms called `dim_template_system` +and `dim_template_user` that will have all the other parameters shared among them. + +### Example 6 + +Check if all dimensions are using more than 50% of cpu: + +```yaml + alarm: dim_template + on: system.cpu + os: linux +lookup: average -3s percentage foreach * + units: % + every: 10s + warn: $this > 50 + crit: $this > 80 +``` + +The `lookup` line will calculate the average of CPU usage from system and user in the last 3 seconds. In this case +Netdata will create alarms for all dimensions of the chart. + +## Troubleshooting + +You can compile Netdata with [debugging](/daemon/README.md#debugging) and then set in `netdata.conf`: + +```yaml +[global] + debug flags = 0x0000000000800000 +``` + +Then check your `/var/log/netdata/debug.log`. It will show you how it works. Important: this will generate a lot of +output in debug.log. + +You can find the context of charts by looking up the chart in either `http://NODE:19999/netdata.conf` or +`http://NODE:19999/api/v1/charts`, replacing `NODE` with the IP address or hostname for your Agent dashboard. + +You can find how Netdata interpreted the expressions by examining the alarm at +`http://NODE:19999/api/v1/alarms?all`. For each expression, Netdata will return the expression as given in its +config file, and the same expression with additional parentheses added to indicate the evaluation flow of the +expression. + +## Disabling health checks or silencing notifications at runtime + +It's currently not possible to schedule notifications from within the alarm template. For those scenarios where you need +to temporary disable notifications (for instance when running backups triggers a disk alert) you can disable or silence +notifications are runtime. The health checks can be controlled at runtime via the [health management +api](/web/api/health/README.md). + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Freference%2F&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/health.c b/health/health.c index 329191fb8..b81361e8a 100644 --- a/health/health.c +++ b/health/health.c @@ -2,14 +2,49 @@ #include "health.h" -struct health_cmdapi_thread_status { - int status; - ; - struct rusage rusage; -}; - unsigned int default_health_enabled = 1; +char *silencers_filename; + +// the queue of executed alarm notifications that haven't been waited for yet +static struct { + ALARM_ENTRY *head; // oldest + ALARM_ENTRY *tail; // latest +} alarm_notifications_in_progress = {NULL, NULL}; + +static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae) +{ + ae->prev_in_progress = NULL; + ae->next_in_progress = NULL; + + if (NULL != alarm_notifications_in_progress.tail) { + ae->prev_in_progress = alarm_notifications_in_progress.tail; + alarm_notifications_in_progress.tail->next_in_progress = ae; + } + if (NULL == alarm_notifications_in_progress.head) { + alarm_notifications_in_progress.head = ae; + } + alarm_notifications_in_progress.tail = ae; + +} +static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae) +{ + struct alarm_entry *prev = ae->prev_in_progress; + struct alarm_entry *next = ae->next_in_progress; + + if (NULL != prev) { + prev->next_in_progress = next; + } + if (NULL != next) { + next->prev_in_progress = prev; + } + if (ae == alarm_notifications_in_progress.head) { + alarm_notifications_in_progress.head = next; + } + if (ae == alarm_notifications_in_progress.tail) { + alarm_notifications_in_progress.tail = prev; + } +} // ---------------------------------------------------------------------------- // health initialization @@ -44,7 +79,7 @@ inline char *health_stock_config_dir(void) { * * Function used to initialize the silencer structure. */ -void health_silencers_init(void) { +static void health_silencers_init(void) { FILE *fd = fopen(silencers_filename, "r"); if (fd) { fseek(fd, 0 , SEEK_END); @@ -70,7 +105,7 @@ void health_silencers_init(void) { } fclose(fd); } else { - error("Cannot open the file %s",silencers_filename); + info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename); } } @@ -100,7 +135,7 @@ void health_init(void) { * * @param host the structure of the host that the function will reload the configuration. */ -void health_reload_host(RRDHOST *host) { +static void health_reload_host(RRDHOST *host) { if(unlikely(!host->health_enabled)) return; @@ -152,9 +187,14 @@ void health_reload_host(RRDHOST *host) { rrdhost_wrlock(host); health_readdir(host, user_path, stock_path, NULL); + //Discard alarms with labels that do not apply to host + rrdcalc_labels_unlink_alarm_from_host(host); + // link the loaded alarms to their charts RRDDIM *rd; rrdset_foreach_write(st, host) { + if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) + continue; rrdsetcalc_link_matching(st); rrdcalctemplate_link_matching(st); @@ -175,7 +215,10 @@ void health_reload_host(RRDHOST *host) { * Reload the host configuration for all hosts. */ void health_reload(void) { - +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) + aclk_single_update_disable(); +#endif rrd_rdlock(); RRDHOST *host; @@ -183,6 +226,12 @@ void health_reload(void) { health_reload_host(host); rrd_unlock(); +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + aclk_single_update_enable(); + aclk_alarm_reload(); + } +#endif } // ---------------------------------------------------------------------------- @@ -252,7 +301,6 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1]; - pid_t command_pid; const char *exec = (ae->exec) ? ae->exec : host->health_default_exec; const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient; @@ -308,25 +356,30 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; - ae->exec_run_timestamp = now_realtime_sec(); + ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ debug(D_HEALTH, "executing command '%s'", command_to_run); - FILE *fp = mypopen(command_to_run, &command_pid); - if(!fp) { - error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run); - goto done; - } - debug(D_HEALTH, "HEALTH reading from command (discarding command's output)"); - char buffer[100 + 1]; - while(fgets(buffer, 100, fp) != NULL) ; - ae->exec_code = mypclose(fp, command_pid); + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; + ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); + enqueue_alarm_notify_in_progress(ae); + + return; //health_alarm_wait_for_execution +done: + health_alarm_log_save(host, ae); +} + +static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { + if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)) + return; + + spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp); debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code); + ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; if(ae->exec_code != 0) ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED; -done: - health_alarm_log_save(host, ae); + unlink_alarm_notify_in_progress(ae); } static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { @@ -388,6 +441,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = ae->next; if(likely(!alarm_entry_isrepeating(host, ae))) { + health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); host->health_log.count--; } @@ -430,14 +484,21 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) return 0; } + if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name); + return 0; + } + if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name); return 0; } int update_every = rc->rrdset->update_every; - time_t first = rrdset_first_entry_t(rc->rrdset); - time_t last = rrdset_last_entry_t(rc->rrdset); + rrdset_rdlock(rc->rrdset); + time_t first = rrdset_first_entry_t_nolock(rc->rrdset); + time_t last = rrdset_last_entry_t_nolock(rc->rrdset); + rrdset_unlock(rc->rrdset); if(unlikely(now + update_every < first /* || now - update_every > last */)) { debug(D_HEALTH @@ -488,7 +549,7 @@ static void health_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } -SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { +static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { SILENCER *s; debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:""); @@ -530,7 +591,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { * * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise */ -int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { +static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; // Clear the flags rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); @@ -577,6 +638,8 @@ void *health_main(void *ptr) { time_t now = now_realtime_sec(); time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); + rrdcalc_labels_unlink(); + unsigned int loop = 0; while(!netdata_exit) { loop++; @@ -930,6 +993,7 @@ void *health_main(void *ptr) { rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE; health_process_notifications(host, ae); debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); } } @@ -944,11 +1008,23 @@ void *health_main(void *ptr) { // and cleanup health_alarm_log_process(host); - if (unlikely(netdata_exit)) + if (unlikely(netdata_exit)) { + // wait for all notifications to finish before allowing health to be cleaned up + ALARM_ENTRY *ae; + while (NULL != (ae = alarm_notifications_in_progress.head)) { + health_alarm_wait_for_execution(ae); + } break; + } } /* rrdhost_foreach */ + // wait for all notifications to finish before allowing health to be cleaned up + ALARM_ENTRY *ae; + while (NULL != (ae = alarm_notifications_in_progress.head)) { + health_alarm_wait_for_execution(ae); + } + rrd_unlock(); diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf new file mode 100644 index 000000000..a2d248efe --- /dev/null +++ b/health/health.d/anomalies.conf @@ -0,0 +1,17 @@ +# raise a warning alarm if an anomaly probability is consistently above 50% + +template: anomaly_probabilities + on: anomalies.probability + lookup: average -2m foreach * + every: 1m + warn: $this > 50 + info: average anomaly probability > 50% for last 2 minutes + +# raise a warning alarm if an anomaly flag is consistently firing + +template: anomaly_flags + on: anomalies.anomaly + lookup: sum -2m foreach * + every: 1m + warn: $this > 10 + info: count of anomalies > 10 for last 2 minutes diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf new file mode 100644 index 000000000..9a27bc6ba --- /dev/null +++ b/health/health.d/apps_plugin.conf @@ -0,0 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# disabled due to https://github.com/netdata/netdata/issues/10327 +# +# alarm: used_file_descriptors +# on: apps.files +# hosts: * +# calc: $fdperc +# units: % +# every: 5s +# warn: $this > (($status >= $WARNING) ? (75) : (80)) +# crit: $this > (($status == $CRITICAL) ? (85) : (90)) +# delay: down 5m multiplier 1.5 max 1h +# info: Peak percentage of file descriptors used +# to: sysadmin diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf index 7af100d8f..e51b8aa5f 100644 --- a/health/health.d/backend.conf +++ b/health/health.d/backend.conf @@ -1,3 +1,13 @@ +# Alert that backends subsystem will be disabled soon + alarm: backend_metrics_eol + on: netdata.backend_metrics + units: boolean + calc: $now - $last_collected_t + every: 1m + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf. + to: sysadmin # make sure we are sending data to backend @@ -32,6 +42,7 @@ info: number of metrics lost due to repeating failures to contact the backend server to: dba + # this chart has been removed from netdata # alarm: backend_slow # on: netdata.backend_latency diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf new file mode 100644 index 000000000..8ab2c9d0f --- /dev/null +++ b/health/health.d/cockroachdb.conf @@ -0,0 +1,91 @@ + +# Availability + +template: cockroachdb_last_collected_secs + on: cockroachdb.live_nodes + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + +# Capacity + +template: cockroachdb_used_storage_capacity + on: cockroachdb.storage_used_capacity_percentage + calc: $capacity_used_percent + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: entire disk usage percentage + to: dba + +template: cockroachdb_used_usable_storage_capacity + on: cockroachdb.storage_used_capacity_percentage + calc: $capacity_usable_used_percent + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: usable space usage percentage + to: dba + +# Replication + +template: cockroachdb_unavailable_ranges + on: cockroachdb.ranges_replication_problem + calc: $ranges_unavailable + units: num + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of ranges with fewer live replicas than the replication target + to: dba + +template: cockroachdb_replicas_leaders_not_leaseholders + on: cockroachdb.replicas_leaders + calc: $replicas_leaders_not_leaseholders + units: num + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of replicas that are Raft leaders whose range lease is held by another store + to: dba + +# FD + +template: cockroachdb_open_file_descriptors_limit + on: cockroachdb.process_file_descriptors + calc: $sys_fd_open/$sys_fd_softlimit * 100 + units: % + every: 10s + warn: $this > 80 + delay: down 15m multiplier 1.5 max 1h + info: open file descriptors usage percentage + to: dba + +# SQL + +template: cockroachdb_sql_active_connections + on: cockroachdb.sql_connections + calc: $sql_conns + units: active connections + every: 10s + info: number of active SQL connections + to: dba + +template: cockroachdb_sql_executed_statements_total_last_5m + on: cockroachdb.sql_statements_total + lookup: sum -5m absolute of sql_query_count + units: statements + every: 10s + warn: $this == 0 AND $cockroachdb_sql_active_connections != 0 + delay: down 15m up 30s multiplier 1.5 max 1h + info: number of executed SQL statements in the last 5 minutes + to: dba diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index ce9839ef1..274673e3e 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -5,7 +5,7 @@ on: netdata.dbengine_global_errors os: linux freebsd macos hosts: * -lookup: sum -10m unaligned of FS errors +lookup: sum -10m unaligned of fs_errors units: errors every: 10s crit: $this > 0 @@ -17,7 +17,7 @@ lookup: sum -10m unaligned of FS errors on: netdata.dbengine_global_errors os: linux freebsd macos hosts: * -lookup: sum -10m unaligned of I/O errors +lookup: sum -10m unaligned of io_errors units: errors every: 10s crit: $this > 0 @@ -25,14 +25,26 @@ lookup: sum -10m unaligned of I/O errors info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) to: sysadmin - alarm: 10min_dbengine_global_flushing_errors + alarm: 10min_dbengine_global_flushing_warnings on: netdata.dbengine_global_errors os: linux freebsd macos hosts: * -lookup: sum -10m unaligned of flushing errors +lookup: sum -10m unaligned of pg_cache_over_half_dirty_events units: errors - every: 3s - crit: $this > 0 + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks + to: sysadmin + + alarm: 10min_dbengine_global_flushing_errors + on: netdata.dbengine_long_term_page_stats + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of flushing_pressure_deletions + units: pages + every: 10s + crit: $this != 0 delay: down 1h multiplier 1.5 max 3h - info: number of times in the last 10 minutes that the dbengine failed to completely flush data to disk, metric data will not be stored in the database, please reduce disk load or use a faster disk + info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks to: sysadmin diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf new file mode 100644 index 000000000..113c950e6 --- /dev/null +++ b/health/health.d/dns_query.conf @@ -0,0 +1,12 @@ + +# detect dns query failure + +template: dns_query_time_query_time + on: dns_query_time.query_time + lookup: average -10s unaligned foreach * + units: ms + every: 10s + warn: $this == nan + delay: up 20s down 5m multiplier 1.5 max 1h + info: query round trip time + to: sysadmin diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf index dffd40965..f4423449f 100644 --- a/health/health.d/elasticsearch.conf +++ b/health/health.d/elasticsearch.conf @@ -1,5 +1,8 @@ - alarm: elasticsearch_last_collected - on: elasticsearch_local.cluster_health_status + +# make sure elasticsearch is running + +template: elasticsearch_last_collected + on: elasticsearch.cluster_health_status calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf new file mode 100644 index 000000000..506cb0cf7 --- /dev/null +++ b/health/health.d/exporting.conf @@ -0,0 +1,34 @@ + +template: exporting_last_buffering +families: * + on: exporting_data_size + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful buffering of exporting data + to: dba + +template: exporting_metrics_sent +families: * + on: exporting_data_size + units: % + calc: abs($sent) * 100 / abs($buffered) + every: 10s + warn: $this != 100 + delay: down 5m multiplier 1.5 max 1h + info: percentage of metrics sent to the external database server + to: dba + +template: exporting_metrics_lost +families: * + on: exporting_data_size + units: metrics + calc: abs($lost) + every: 10s + crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) + delay: down 5m multiplier 1.5 max 1h + info: number of metrics lost due to repeating failures to contact the external database server + to: dba diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index a53ec7a56..2f906e187 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -12,7 +12,7 @@ template: mdstat_disks on: md.disks units: failed devices every: 10s - calc: $total - $inuse + calc: $down crit: $this > 0 info: Array is degraded! to: sysadmin @@ -21,8 +21,9 @@ template: mdstat_mismatch_cnt on: md.mismatch_cnt units: unsynchronized blocks calc: $count - every: 10s - crit: $this > 0 + every: 60s + warn: $this > 1024 + delay: up 30m info: Mismatch count! to: sysadmin diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf index 73b87dcc0..6e81a2a0e 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -1,4 +1,4 @@ - alarm: adapter_state +template: adapter_state on: megacli.adapter_degraded units: is degraded lookup: sum -10s @@ -27,7 +27,7 @@ template: bbu_cycle_count info: BBU cycle count to: sysadmin - alarm: pd_media_errors +template: pd_media_errors on: megacli.pd_media_error units: media errors lookup: sum -10s @@ -37,7 +37,7 @@ template: bbu_cycle_count info: physical drive media errors to: sysadmin - alarm: pd_predictive_failures +template: pd_predictive_failures on: megacli.pd_predictive_failure units: predictive failures lookup: sum -10s diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 2bec56387..62cef5a2e 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -79,7 +79,7 @@ template: mysql_connections template: mysql_replication on: mysql.slave_status - calc: ($sql_running == -1 OR $io_running == -1)?0:1 + calc: ($sql_running <= 0 OR $io_running <= 0)?0:1 units: ok/failed every: 10s crit: $this == 0 diff --git a/health/health.d/net.conf b/health/health.d/net.conf index e43cb1691..261290e51 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -110,6 +110,34 @@ families: * info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes to: sysadmin +# ----------------------------------------------------------------------------- +# interface errors + +template: interface_inbound_errors + on: net.errors + os: freebsd + hosts: * +families: * + lookup: sum -10m unaligned absolute of inbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound errors in the last 10 minutes + to: sysadmin + +template: interface_outbound_errors + on: net.errors + os: freebsd + hosts: * +families: * + lookup: sum -10m unaligned absolute of outbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound errors in the last 10 minutes + to: sysadmin # ----------------------------------------------------------------------------- # FIFO errors @@ -132,7 +160,6 @@ families: * info: interface fifo errors in the last 10 minutes to: sysadmin - # ----------------------------------------------------------------------------- # check for packet storms diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index f42b63d30..696333fd8 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -31,18 +31,16 @@ families: * crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h info: average of timeouts during the last 5 minutes - options: no-clear-notification to: sysadmin template: connection_fails families: * on: portcheck.status - lookup: average -5m unaligned percentage of no_connection + lookup: average -5m unaligned percentage of no_connection,failed every: 10s units: % warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h info: average of failed connections during the last 5 minutes - options: no-clear-notification to: sysadmin diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf index d96998fdf..293f1aa0d 100644 --- a/health/health.d/processes.conf +++ b/health/health.d/processes.conf @@ -1,27 +1,13 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: active_processes_limit_freebsd + alarm: active_processes on: system.active_processes - os: freebsd hosts: * - calc: $active - units: processes + calc: $active * 100 / $pidmax + units: % every: 5s - warn: $this > (($status >= $WARNING) ? (75000) : (80000)) - crit: $this > (($status == $CRITICAL) ? (85000) : (90000)) + warn: $this > (($status >= $WARNING) ? (75) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (90)) delay: down 5m multiplier 1.5 max 1h - info: the number of active processes - to: sysadmin - - alarm: active_processes_limit - on: system.active_processes - os: linux - hosts: * - calc: $active - units: processes - every: 5s - warn: $this > (($status >= $WARNING) ? (25000) : (26000)) - crit: $this > (($status == $CRITICAL) ? (28000) : (30000)) - delay: down 5m multiplier 1.5 max 1h - info: number of active processes + info: the percentage of active processes to: sysadmin diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf new file mode 100644 index 000000000..014789451 --- /dev/null +++ b/health/health.d/pulsar.conf @@ -0,0 +1,13 @@ + +# Availability + +template: pulsar_last_collected_secs + on: pulsar.broker_components + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 15e8e8464..0a71dac84 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -5,7 +5,7 @@ on: system.ram os: linux freebsd hosts: * - calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) + calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) every: 10s info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) @@ -14,7 +14,7 @@ os: linux hosts: * # calc: $used * 100 / ($used + $cached + $free) - calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free) + calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free) units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf new file mode 100644 index 000000000..1a3088a2a --- /dev/null +++ b/health/health.d/scaleio.conf @@ -0,0 +1,38 @@ + +# make sure scaleio is running + +template: scaleio_last_collected_secs + on: scaleio.system_capacity_total + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure Storage Pool capacity utilization is under limit + +template: scaleio_storage_pool_capacity_utilization + on: scaleio.storage_pool_capacity_utilization + calc: $used + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: Storage Pool capacity utilization + to: sysadmin + + +# make sure Sdc is connected to MDM + +template: scaleio_sdc_mdm_connection_state + on: scaleio.sdc_mdm_connection_state + calc: $connected + every: 10s + warn: $this != 1 + delay: up 30s down 5m multiplier 1.5 max 1h + info: Sdc connection to MDM state + to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index ff3648626..f835f2aee 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -10,7 +10,7 @@ lookup: average -1m unaligned absolute of dropped units: packets every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10) + warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) to: sysadmin diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf new file mode 100644 index 000000000..bdedc11a0 --- /dev/null +++ b/health/health.d/unbound.conf @@ -0,0 +1,35 @@ + +# make sure unbound is running + +template: unbound_last_collected_secs + on: unbound.queries + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure there is no overwritten/dropped queries in the request-list + +template: unbound_request_list_overwritten + on: unbound.request_list_jostle_list + lookup: average -60s unaligned absolute match-names of overwritten + units: queries + every: 10s + warn: $this > 5 + delay: up 10 down 5m multiplier 1.5 max 1h + info: the number of overwritten queries in the request-list + to: sysadmin + +template: unbound_request_list_dropped + on: unbound.request_list_jostle_list + lookup: average -60s unaligned absolute match-names of dropped + units: queries + every: 10s + warn: $this > 0 + delay: up 10 down 5m multiplier 1.5 max 1h + info: the number of dropped queries in the request-list + to: sysadmin diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf new file mode 100644 index 000000000..36bbaf82b --- /dev/null +++ b/health/health.d/vernemq.conf @@ -0,0 +1,399 @@ + +# Availability + +template: vernemq_last_collected_secs + on: vernemq.node_uptime + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# Socket errors + +template: vernemq_socket_errors + on: vernemq.socket_errors + lookup: sum -1m unaligned absolute of socket_error + units: errors + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: socket errors in the last minute + to: sysadmin + +# Queues dropped/expired/unhandled PUBLISH messages + +template: vernemq_queue_message_drop + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_drop + units: dropped messages + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: dropped messaged due to full queues in the last minute + to: sysadmin + +template: vernemq_queue_message_expired + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_expired + units: expired messages + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (15)) + delay: down 5m multiplier 1.5 max 2h + info: messages which expired before delivery in the last minute + to: sysadmin + +template: vernemq_queue_message_unhandled + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_unhandled + units: unhandled messages + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: unhandled messages (connections with clean session=true) in the last minute + to: sysadmin + +# Erlang VM + +template: vernemq_average_scheduler_utilization + on: vernemq.average_scheduler_utilization + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: average scheduler utilization for the last 10 minutes + to: sysadmin + +# Cluster communication and netsplits + +template: vernemq_cluster_dropped + on: vernemq.cluster_dropped + lookup: average -1m unaligned + units: KiB/s + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: the amount of traffic dropped during communication with the cluster nodes in the last minute + to: sysadmin + +template: vernemq_netsplits + on: vernemq.netsplits + lookup: sum -1m unaligned absolute of netsplit_detected + units: netsplits + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: detected netsplits in the last minute + to: sysadmin + +# Unsuccessful CONNACK + +template: vernemq_mqtt_connack_sent_reason_success + on: vernemq.mqtt_connack_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v3/v5 CONNACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_connack_sent_reason_unsuccessful + on: vernemq.mqtt_connack_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_connack_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v3/v5 CONNACK sent in the last minute + to: sysadmin + +# Not normal DISCONNECT + +template: vernemq_mqtt_disconnect_received_reason_normal_disconnect + on: vernemq.mqtt_disconnect_received_reason + lookup: sum -1m unaligned absolute match-names of normal_disconnect + units: packets + every: 10s + info: normal v5 DISCONNECT received in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect + on: vernemq.mqtt_disconnect_sent_reason + lookup: sum -1m unaligned absolute match-names of normal_disconnect + units: packets + every: 10s + info: normal v5 DISCONNECT sent in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_received_reason_not_normal + on: vernemq.mqtt_disconnect_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: not normal v5 DISCONNECT received in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_sent_reason_not_normal + on: vernemq.mqtt_disconnect_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: not normal v5 DISCONNECT sent in the last minute + to: sysadmin + +# SUBSCRIBE errors and unauthorized attempts + +template: vernemq_mqtt_subscribe_error + on: vernemq.mqtt_subscribe_error + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 SUBSCRIBE operations in the last minute + to: sysadmin + +template: vernemq_mqtt_subscribe_auth_error + on: vernemq.mqtt_subscribe_auth_error + lookup: sum -1m unaligned absolute + units: attempts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute + to: sysadmin + +# UNSUBSCRIBE errors + +template: vernemq_mqtt_unsubscribe_error + on: vernemq.mqtt_unsubscribe_error + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 UNSUBSCRIBE operations in the last minute + to: sysadmin + +# PUBLISH errors and unauthorized attempts + +template: vernemq_mqtt_publish_errors + on: vernemq.mqtt_publish_errors + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 PUBLISH operations in the last minute + to: sysadmin + +template: vernemq_mqtt_publish_auth_errors + on: vernemq.mqtt_publish_auth_errors + lookup: sum -1m unaligned absolute + units: attempts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unauthorized v3/v5 PUBLISH attempts in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBACK + +template: vernemq_mqtt_puback_received_reason_success + on: vernemq.mqtt_puback_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBACK received in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_sent_reason_success + on: vernemq.mqtt_puback_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_received_reason_unsuccessful + on: vernemq.mqtt_puback_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_puback_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBACK received in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_sent_reason_unsuccessful + on: vernemq.mqtt_puback_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_puback_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_unexpected + on: vernemq.mqtt_puback_invalid_error + lookup: sum -1m unaligned absolute + units: messages + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unexpected v3/v5 PUBACK received in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBREC + +template: vernemq_mqtt_pubrec_received_reason_success + on: vernemq.mqtt_pubrec_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREC received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_sent_reason_success + on: vernemq.mqtt_pubrec_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREC sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_received_reason_unsuccessful + on: vernemq.mqtt_pubrec_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrec_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREC received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_sent_reason_unsuccessful + on: vernemq.mqtt_pubrec_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrec_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREC sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_invalid_error + on: vernemq.mqtt_pubrec_invalid_error + lookup: sum -1m unaligned absolute + units: messages + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unexpected v3 PUBREC received in the last minute + to: sysadmin + +# Unsuccessful PUBREL + +template: vernemq_mqtt_pubrel_received_reason_success + on: vernemq.mqtt_pubrel_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREL received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_sent_reason_success + on: vernemq.mqtt_pubrel_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREL sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_received_reason_unsuccessful + on: vernemq.mqtt_pubrel_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrel_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREL received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_sent_reason_unsuccessful + on: vernemq.mqtt_pubrel_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrel_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREL sent in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBCOMP + +template: vernemq_mqtt_pubcomp_received_reason_success + on: vernemq.mqtt_pubcomp_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBCOMP received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_sent_reason_success + on: vernemq.mqtt_pubcomp_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBCOMP sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_received_reason_unsuccessful + on: vernemq.mqtt_pubcomp_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubcomp_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBCOMP received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful + on: vernemq.mqtt_pubcomp_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBCOMP sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_unexpected + on: vernemq.mqtt_pubcomp_invalid_error + lookup: sum -1m unaligned absolute + units: messages + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unexpected v3/v5 PUBCOMP received in the last minute + to: sysadmin diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 1aefd7b00..44de38a48 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -111,7 +111,6 @@ families: * units: % every: 10s warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 ) - crit: ($1m_total_requests > 120) ? ($this > 5) : ( 0 ) delay: up 1m down 5m multiplier 1.5 max 1h info: the ratio of unmatched lines, over the last minute to: webmaster @@ -235,7 +234,6 @@ families: * units: % every: 10s warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 ) - crit: ($web_log_1m_total_requests > 120) ? ($this > 5) : ( 0 ) delay: up 1m down 5m multiplier 1.5 max 1h info: the ratio of unmatched lines, over the last minute to: webmaster diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf new file mode 100644 index 000000000..275e11dd9 --- /dev/null +++ b/health/health.d/whoisquery.conf @@ -0,0 +1,24 @@ + +# make sure whoisquery is running + +template: whoisquery_last_collected_secs + on: whoisquery.time_until_expiration + calc: $now - $last_collected_t + units: seconds ago + every: 60s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + +template: whoisquery_days_until_expiration + on: whoisquery.time_until_expiration + calc: $expiry + units: seconds + every: 60s + warn: $this < $days_until_expiration_warning*24*60*60 + crit: $this < $days_until_expiration_critical*24*60*60 + info: domain time until expiration + to: webmaster diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index a56f48fc3..dfca37706 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -22,3 +22,11 @@ template: x509check_days_until_expiration crit: $this < $days_until_expiration_critical*24*60*60 info: certificate time until expiration to: webmaster + +template: x509check_revocation_status + on: x509check.revocation_status + calc: $revoked + every: 60s + crit: $this != nan AND $this != 0 + info: certificate revocation status + to: webmaster diff --git a/health/health.h b/health/health.h index ab367e903..5281e16e3 100644 --- a/health/health.h +++ b/health/health.h @@ -24,6 +24,7 @@ extern unsigned int default_health_enabled; #define HEALTH_ENTRY_FLAG_EXEC_FAILED 0x00000008 #define HEALTH_ENTRY_FLAG_SILENCED 0x00000010 #define HEALTH_ENTRY_RUN_ONCE 0x00000020 +#define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040 #define HEALTH_ENTRY_FLAG_SAVED 0x10000000 #define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000 @@ -52,7 +53,7 @@ extern unsigned int default_health_enabled; #define HEALTH_SILENCERS_MAX_FILE_LEN 10000 -char *silencers_filename; +extern char *silencers_filename; extern void health_init(void); extern void *health_main(void *ptr); @@ -62,16 +63,14 @@ extern void health_reload(void); extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result); extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); +extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after); void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf); void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf); extern int health_alarm_log_open(RRDHOST *host); -extern void health_alarm_log_close(RRDHOST *host); -extern void health_log_rotate(RRDHOST *host); extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); -extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename); extern void health_alarm_log_load(RRDHOST *host); extern ALARM_ENTRY* health_create_alarm_entry( @@ -100,13 +99,14 @@ extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae); extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath); extern char *health_user_config_dir(void); extern char *health_stock_config_dir(void); -extern void health_reload_host(RRDHOST *host); extern void health_alarm_log_free(RRDHOST *host); extern void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); extern void *health_cmdapi_thread(void *ptr); +extern void health_label_log_save(RRDHOST *host); + extern SIMPLE_PATTERN *health_pattern_from_foreach(char *s); #endif //NETDATA_HEALTH_H diff --git a/health/health_config.c b/health/health_config.c index 65c6d8bd7..a200a0dbf 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -10,6 +10,8 @@ #define HEALTH_HOST_KEY "hosts" #define HEALTH_OS_KEY "os" #define HEALTH_FAMILIES_KEY "families" +#define HEALTH_PLUGIN_KEY "plugin" +#define HEALTH_MODULE_KEY "module" #define HEALTH_LOOKUP_KEY "lookup" #define HEALTH_CALC_KEY "calc" #define HEALTH_EVERY_KEY "every" @@ -24,6 +26,7 @@ #define HEALTH_DELAY_KEY "delay" #define HEALTH_OPTIONS_KEY "options" #define HEALTH_REPEAT_KEY "repeat" +#define HEALTH_HOST_LABEL_KEY "host labels" static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { if(!rc->chart) { @@ -484,6 +487,8 @@ static int health_readfile(const char *filename, void *data) { hash_on = 0, hash_host = 0, hash_families = 0, + hash_plugin = 0, + hash_module = 0, hash_calc = 0, hash_green = 0, hash_red = 0, @@ -497,7 +502,8 @@ static int health_readfile(const char *filename, void *data) { hash_recipient = 0, hash_delay = 0, hash_options = 0, - hash_repeat = 0; + hash_repeat = 0, + hash_host_label = 0; char buffer[HEALTH_CONF_MAX_LINE + 1]; @@ -508,6 +514,8 @@ static int health_readfile(const char *filename, void *data) { hash_os = simple_uhash(HEALTH_OS_KEY); hash_host = simple_uhash(HEALTH_HOST_KEY); hash_families = simple_uhash(HEALTH_FAMILIES_KEY); + hash_plugin = simple_uhash(HEALTH_PLUGIN_KEY); + hash_module = simple_uhash(HEALTH_MODULE_KEY); hash_calc = simple_uhash(HEALTH_CALC_KEY); hash_lookup = simple_uhash(HEALTH_LOOKUP_KEY); hash_green = simple_uhash(HEALTH_GREEN_KEY); @@ -522,6 +530,7 @@ static int health_readfile(const char *filename, void *data) { hash_delay = simple_uhash(HEALTH_DELAY_KEY); hash_options = simple_uhash(HEALTH_OPTIONS_KEY); hash_repeat = simple_uhash(HEALTH_REPEAT_KEY); + hash_host_label = simple_uhash(HEALTH_HOST_LABEL_KEY); } FILE *fp = fopen(filename, "r"); @@ -795,6 +804,33 @@ static int health_readfile(const char *filename, void *data) { &rc->warn_repeat_every, &rc->crit_repeat_every); } + else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { + if(rc->labels) { + if(strcmp(rc->labels, value) != 0) + error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", + line, filename, rc->name, key, value, value); + + freez(rc->labels); + simple_pattern_free(rc->splabels); + } + + rc->labels = simple_pattern_trim_around_equal(value); + rc->splabels = simple_pattern_create(rc->labels, NULL, SIMPLE_PATTERN_EXACT); + } + else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) { + freez(rc->plugin_match); + simple_pattern_free(rc->plugin_pattern); + + rc->plugin_match = strdupz(value); + rc->plugin_pattern = simple_pattern_create(rc->plugin_match, NULL, SIMPLE_PATTERN_EXACT); + } + else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) { + freez(rc->module_match); + simple_pattern_free(rc->module_pattern); + + rc->module_match = strdupz(value); + rc->module_pattern = simple_pattern_create(rc->module_match, NULL, SIMPLE_PATTERN_EXACT); + } else { error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.", line, filename, rc->name, key); @@ -819,6 +855,20 @@ static int health_readfile(const char *filename, void *data) { rt->family_match = strdupz(value); rt->family_pattern = simple_pattern_create(rt->family_match, NULL, SIMPLE_PATTERN_EXACT); } + else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) { + freez(rt->plugin_match); + simple_pattern_free(rt->plugin_pattern); + + rt->plugin_match = strdupz(value); + rt->plugin_pattern = simple_pattern_create(rt->plugin_match, NULL, SIMPLE_PATTERN_EXACT); + } + else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) { + freez(rt->module_match); + simple_pattern_free(rt->module_pattern); + + rt->module_match = strdupz(value); + rt->module_pattern = simple_pattern_create(rt->module_match, NULL, SIMPLE_PATTERN_EXACT); + } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { health_parse_db_lookup(line, filename, value, &rt->group, &rt->after, &rt->before, &rt->update_every, &rt->options, &rt->dimensions, &rt->foreachdim); @@ -927,6 +977,19 @@ static int health_readfile(const char *filename, void *data) { &rt->warn_repeat_every, &rt->crit_repeat_every); } + else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { + if(rt->labels) { + if(strcmp(rt->labels, value) != 0) + error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rt->name, key, rt->labels, value, value); + + freez(rt->labels); + simple_pattern_free(rt->splabels); + } + + rt->labels = simple_pattern_trim_around_equal(value); + rt->splabels = simple_pattern_create(rt->labels, NULL, SIMPLE_PATTERN_EXACT); + } else { error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", line, filename, rt->name, key); diff --git a/health/health_json.c b/health/health_json.c index 8a088d034..d068b5427 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -13,7 +13,7 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix); } -static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { +inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { buffer_sprintf(wb, "\n\t{\n" "\t\t\"hostname\": \"%s\",\n" @@ -113,6 +113,25 @@ void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) { netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); } +static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) { + (void)host; + buffer_sprintf(wb, + "\t\t\"%s.%s\": {\n" + "\t\t\t\"id\": %lu,\n" + , rc->chart, rc->name + , (unsigned long)rc->id); + + buffer_strcat(wb, "\t\t\t\"value\":"); + buffer_rrd_value(wb, rc->value); + buffer_strcat(wb, ",\n"); + + buffer_sprintf(wb, + "\t\t\t\"status\": \"%s\"\n" + , rrdcalc_status2string(rc->status)); + + buffer_strcat(wb, "\t\t}"); +} + static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) { char value_string[100 + 1]; format_value_and_unit(value_string, 100, rc->value, rc->units, -1); @@ -272,9 +291,23 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL rrdhost_unlock(host); } -void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) { +static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, void (*fp)(RRDHOST *, BUFFER *, RRDCALC *)) { + RRDCALC *rc; int i; + for(i = 0, rc = host->alarms; rc ; rc = rc->next) { + if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) + continue; + + if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL))) + continue; + + if(likely(i)) buffer_strcat(wb, ",\n"); + fp(host, wb, rc); + i++; + } +} +void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) { rrdhost_rdlock(host); buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\"," "\n\t\"latest_alarm_log_unique_id\": %u," @@ -286,18 +319,7 @@ void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) { host->health_enabled?"true":"false", (unsigned long)now_realtime_sec()); - RRDCALC *rc; - for(i = 0, rc = host->alarms; rc ; rc = rc->next) { - if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) - continue; - - if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL))) - continue; - - if(likely(i)) buffer_strcat(wb, ",\n"); - health_rrdcalc2json_nolock(host, wb, rc); - i++; - } + health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc2json_nolock); // buffer_strcat(wb, "\n\t},\n\t\"templates\": {"); // RRDCALCTEMPLATE *rt; @@ -308,5 +330,38 @@ void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) { rrdhost_unlock(host); } +void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) { + rrdhost_rdlock(host); + buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\"," + "\n\t\"alarms\": {\n", + host->hostname); + + health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc_values2json_nolock); + + buffer_strcat(wb, "\n\t}\n}\n"); + rrdhost_unlock(host); +} +void health_active_log_alarms_2json(RRDHOST *host, BUFFER *wb) { + netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + + buffer_sprintf(wb, "[\n"); + + unsigned int max = host->health_log.max; + unsigned int count = 0; + ALARM_ENTRY *ae; + for(ae = host->health_log.alarms; ae && count < max ; ae = ae->next) { + + if(likely(!((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) + && !ae->updated_by_id))) + continue; + + if(likely(count)) buffer_strcat(wb, ","); + health_alarm_entry2json_nolock(wb, ae, host); + count++; + } + buffer_strcat(wb, "]"); + + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); +} diff --git a/health/health_log.c b/health/health_log.c index c91cde6cb..8c0bc5c34 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -22,14 +22,14 @@ inline int health_alarm_log_open(RRDHOST *host) { return -1; } -inline void health_alarm_log_close(RRDHOST *host) { +static inline void health_alarm_log_close(RRDHOST *host) { if(host->health_log_fp) { fclose(host->health_log_fp); host->health_log_fp = NULL; } } -inline void health_log_rotate(RRDHOST *host) { +static inline void health_log_rotate(RRDHOST *host) { static size_t rotate_every = 0; if(unlikely(rotate_every == 0)) { @@ -67,9 +67,40 @@ inline void health_log_rotate(RRDHOST *host) { } } -inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { +inline void health_label_log_save(RRDHOST *host) { health_log_rotate(host); + if(likely(host->health_log_fp)) { + BUFFER *wb = buffer_create(1024); + rrdhost_check_rdlock(host); + netdata_rwlock_rdlock(&host->labels.labels_rwlock); + struct label *l=localhost->labels.head; + while (l != NULL) { + buffer_sprintf(wb,"%s=%s\t ", l->key, l->value); + l = l->next; + } + netdata_rwlock_unlock(&host->labels.labels_rwlock); + + char *write = (char *) buffer_tostring(wb) ; + + write[wb->len-2] = '\n'; + write[wb->len-1] = '\0'; + + if (unlikely(fprintf(host->health_log_fp, "L\t%s" + , write + ) < 0)) + error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", + host->hostname, host->health_log_filename); + else { + host->health_log_entries_written++; + } + + buffer_free(wb); + } +} + +inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { + health_log_rotate(host); if(likely(host->health_log_fp)) { if(unlikely(fprintf(host->health_log_fp , "%c\t%s" @@ -121,9 +152,33 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { host->health_log_entries_written++; } } +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + if ((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) || + ((ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL))) { + aclk_update_alarm(host, ae); + } + } +#endif +} + +static uint32_t is_valid_alarm_id(RRDHOST *host, const char *chart, const char *name, uint32_t alarm_id) +{ + uint32_t hash_chart = simple_hash(chart); + uint32_t hash_name = simple_hash(name); + + ALARM_ENTRY *ae; + for(ae = host->health_log.alarms; ae ;ae = ae->next) { + if (unlikely( + ae->alarm_id == alarm_id && (!(ae->hash_name == hash_name && ae->hash_chart == hash_chart && + !strcmp(name, ae->name) && !strcmp(chart, ae->chart))))) { + return 0; + } + } + return 1; } -inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) { +static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) { errno = 0; char *s, *buf = mallocz(65536 + 1); @@ -152,6 +207,9 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena else s++; } + if(likely(*pointers[0] == 'L')) + continue; + if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) { ALARM_ENTRY *ae = NULL; @@ -248,6 +306,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena // error("HEALTH [%s]: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", host->hostname, line, filename, pointers[1], host->hostname); ae->unique_id = unique_id; + if (!is_valid_alarm_id(host, pointers[14], pointers[13], alarm_id)) + alarm_id = rrdcalc_get_unique_id(host, pointers[14], pointers[13], NULL); ae->alarm_id = alarm_id; ae->alarm_event_id = (uint32_t)strtoul(pointers[4], NULL, 16); ae->updated_by_id = (uint32_t)strtoul(pointers[5], NULL, 16); @@ -338,7 +398,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena if(!host->health_max_alarm_id) host->health_max_alarm_id = (uint32_t)now_realtime_sec(); host->health_log.next_log_id = host->health_max_unique_id + 1; - host->health_log.next_alarm_id = host->health_max_alarm_id + 1; + if (unlikely(!host->health_log.next_alarm_id || host->health_log.next_alarm_id <= host->health_max_alarm_id)) + host->health_log.next_alarm_id = host->health_max_alarm_id + 1; debug(D_HEALTH, "HEALTH [%s]: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", host->hostname, filename, loaded, updated, errored, duplicate); return loaded; diff --git a/health/notifications/Makefile.am b/health/notifications/Makefile.am index 606770fd0..e6b42138e 100644 --- a/health/notifications/Makefile.am +++ b/health/notifications/Makefile.am @@ -35,14 +35,17 @@ include hangouts/Makefile.inc include irc/Makefile.inc include kavenegar/Makefile.inc include messagebird/Makefile.inc +include opsgenie/Makefile.inc include pagerduty/Makefile.inc include pushbullet/Makefile.inc include pushover/Makefile.inc include rocketchat/Makefile.inc include slack/Makefile.inc include smstools3/Makefile.inc +include stackpulse/Makefile.inc include syslog/Makefile.inc include telegram/Makefile.inc include twilio/Makefile.inc include web/Makefile.inc +include matrix/Makefile.inc include custom/Makefile.inc diff --git a/health/notifications/Makefile.in b/health/notifications/Makefile.in deleted file mode 100644 index 58a117ac9..000000000 --- a/health/notifications/Makefile.in +++ /dev/null @@ -1,835 +0,0 @@ -# Makefile.in generated by automake 1.15.1 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994-2017 Free Software Foundation, Inc. - -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -# SPDX-License-Identifier: GPL-3.0-or-later - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - -# SPDX-License-Identifier: GPL-3.0-or-later - -# THIS IS NOT A COMPLETE Makefile -# IT IS INCLUDED BY ITS PARENT'S Makefile.am -# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT - - -VPATH = @srcdir@ -am__is_gnu_make = { \ - if test -z '$(MAKELEVEL)'; then \ - false; \ - elif test -n '$(MAKE_HOST)'; then \ - true; \ - elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ - true; \ - else \ - false; \ - fi; \ -} -am__make_running_with_option = \ - case $${target_option-} in \ - ?) ;; \ - *) echo "am__make_running_with_option: internal error: invalid" \ - "target option '$${target_option-}' specified" >&2; \ - exit 1;; \ - esac; \ - has_opt=no; \ - sane_makeflags=$$MAKEFLAGS; \ - if $(am__is_gnu_make); then \ - sane_makeflags=$$MFLAGS; \ - else \ - case $$MAKEFLAGS in \ - *\\[\ \ ]*) \ - bs=\\; \ - sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ - | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ - esac; \ - fi; \ - skip_next=no; \ - strip_trailopt () \ - { \ - flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ - }; \ - for flg in $$sane_makeflags; do \ - test $$skip_next = yes && { skip_next=no; continue; }; \ - case $$flg in \ - *=*|--*) continue;; \ - -*I) strip_trailopt 'I'; skip_next=yes;; \ - -*I?*) strip_trailopt 'I';; \ - -*O) strip_trailopt 'O'; skip_next=yes;; \ - -*O?*) strip_trailopt 'O';; \ - -*l) strip_trailopt 'l'; skip_next=yes;; \ - -*l?*) strip_trailopt 'l';; \ - -[dEDm]) skip_next=yes;; \ - -[JT]) skip_next=yes;; \ - esac; \ - case $$flg in \ - *$$target_option*) has_opt=yes; break;; \ - esac; \ - done; \ - test $$has_opt = yes -am__make_dryrun = (target_option=n; $(am__make_running_with_option)) -am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) -pkgdatadir = $(datadir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkglibexecdir = $(libexecdir)/@PACKAGE@ -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -subdir = health/notifications -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/build/m4/ax_c___atomic.m4 \ - $(top_srcdir)/build/m4/ax_c__generic.m4 \ - $(top_srcdir)/build/m4/ax_c_lto.m4 \ - $(top_srcdir)/build/m4/ax_c_mallinfo.m4 \ - $(top_srcdir)/build/m4/ax_c_mallopt.m4 \ - $(top_srcdir)/build/m4/ax_check_compile_flag.m4 \ - $(top_srcdir)/build/m4/ax_gcc_func_attribute.m4 \ - $(top_srcdir)/build/m4/ax_pthread.m4 \ - $(top_srcdir)/build/m4/jemalloc.m4 \ - $(top_srcdir)/build/m4/tcmalloc.m4 $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -DIST_COMMON = $(srcdir)/Makefile.am $(dist_plugins_SCRIPTS) \ - $(dist_libconfig_DATA) $(dist_noinst_DATA) $(am__DIST_COMMON) -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = $(top_builddir)/config.h -CONFIG_CLEAN_FILES = -CONFIG_CLEAN_VPATH_FILES = -am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; -am__vpath_adj = case $$p in \ - $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ - *) f=$$p;; \ - esac; -am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; -am__install_max = 40 -am__nobase_strip_setup = \ - srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` -am__nobase_strip = \ - for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" -am__nobase_list = $(am__nobase_strip_setup); \ - for p in $$list; do echo "$$p $$p"; done | \ - sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ - $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ - if (++n[$$2] == $(am__install_max)) \ - { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ - END { for (dir in files) print dir, files[dir] }' -am__base_list = \ - sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ - sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' -am__uninstall_files_from_dir = { \ - test -z "$$files" \ - || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ - || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ - $(am__cd) "$$dir" && rm -f $$files; }; \ - } -am__installdirs = "$(DESTDIR)$(pluginsdir)" \ - "$(DESTDIR)$(libconfigdir)" -SCRIPTS = $(dist_plugins_SCRIPTS) -AM_V_P = $(am__v_P_@AM_V@) -am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) -am__v_P_0 = false -am__v_P_1 = : -AM_V_GEN = $(am__v_GEN_@AM_V@) -am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) -am__v_GEN_0 = @echo " GEN " $@; -am__v_GEN_1 = -AM_V_at = $(am__v_at_@AM_V@) -am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) -am__v_at_0 = @ -am__v_at_1 = -SOURCES = -DIST_SOURCES = -am__can_run_installinfo = \ - case $$AM_UPDATE_INFO_DIR in \ - n|no|NO) false;; \ - *) (install-info --version) >/dev/null 2>&1;; \ - esac -DATA = $(dist_libconfig_DATA) $(dist_noinst_DATA) -am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) -am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/alerta/Makefile.inc \ - $(srcdir)/awssns/Makefile.inc $(srcdir)/custom/Makefile.inc \ - $(srcdir)/discord/Makefile.inc $(srcdir)/email/Makefile.inc \ - $(srcdir)/flock/Makefile.inc $(srcdir)/hangouts/Makefile.inc \ - $(srcdir)/irc/Makefile.inc $(srcdir)/kavenegar/Makefile.inc \ - $(srcdir)/messagebird/Makefile.inc \ - $(srcdir)/pagerduty/Makefile.inc \ - $(srcdir)/pushbullet/Makefile.inc \ - $(srcdir)/pushover/Makefile.inc \ - $(srcdir)/rocketchat/Makefile.inc $(srcdir)/slack/Makefile.inc \ - $(srcdir)/smstools3/Makefile.inc $(srcdir)/syslog/Makefile.inc \ - $(srcdir)/telegram/Makefile.inc $(srcdir)/twilio/Makefile.inc \ - $(srcdir)/web/Makefile.inc $(top_srcdir)/build/subst.inc -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -ACLOCAL = @ACLOCAL@ -AMTAR = @AMTAR@ -AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CMOCKA_CFLAGS = @CMOCKA_CFLAGS@ -CMOCKA_LIBS = @CMOCKA_LIBS@ -CPP = @CPP@ -CPPFLAGS = @CPPFLAGS@ -CUPSCONFIG = @CUPSCONFIG@ -CXX = @CXX@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CXX_BINARY = @CXX_BINARY@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -ENABLE_UNITTESTS = @ENABLE_UNITTESTS@ -EXEEXT = @EXEEXT@ -GREP = @GREP@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -IPMIMONITORING_CFLAGS = @IPMIMONITORING_CFLAGS@ -IPMIMONITORING_LIBS = @IPMIMONITORING_LIBS@ -JSON_CFLAGS = @JSON_CFLAGS@ -JSON_LIBS = @JSON_LIBS@ -LDFLAGS = @LDFLAGS@ -LIBCAP_CFLAGS = @LIBCAP_CFLAGS@ -LIBCAP_LIBS = @LIBCAP_LIBS@ -LIBCRYPTO_CFLAGS = @LIBCRYPTO_CFLAGS@ -LIBCRYPTO_LIBS = @LIBCRYPTO_LIBS@ -LIBCURL_CFLAGS = @LIBCURL_CFLAGS@ -LIBCURL_LIBS = @LIBCURL_LIBS@ -LIBMNL_CFLAGS = @LIBMNL_CFLAGS@ -LIBMNL_LIBS = @LIBMNL_LIBS@ -LIBMONGOC_CFLAGS = @LIBMONGOC_CFLAGS@ -LIBMONGOC_LIBS = @LIBMONGOC_LIBS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LIBSSL_CFLAGS = @LIBSSL_CFLAGS@ -LIBSSL_LIBS = @LIBSSL_LIBS@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ -MATH_CFLAGS = @MATH_CFLAGS@ -MATH_LIBS = @MATH_LIBS@ -MKDIR_P = @MKDIR_P@ -NFACCT_CFLAGS = @NFACCT_CFLAGS@ -NFACCT_LIBS = @NFACCT_LIBS@ -OBJEXT = @OBJEXT@ -OPTIONAL_CUPS_CFLAGS = @OPTIONAL_CUPS_CFLAGS@ -OPTIONAL_CUPS_LIBS = @OPTIONAL_CUPS_LIBS@ -OPTIONAL_IPMIMONITORING_CFLAGS = @OPTIONAL_IPMIMONITORING_CFLAGS@ -OPTIONAL_IPMIMONITORING_LIBS = @OPTIONAL_IPMIMONITORING_LIBS@ -OPTIONAL_JSONC_LIBS = @OPTIONAL_JSONC_LIBS@ -OPTIONAL_JUDY_LIBS = @OPTIONAL_JUDY_LIBS@ -OPTIONAL_KINESIS_CFLAGS = @OPTIONAL_KINESIS_CFLAGS@ -OPTIONAL_KINESIS_LIBS = @OPTIONAL_KINESIS_LIBS@ -OPTIONAL_LIBCAP_CFLAGS = @OPTIONAL_LIBCAP_CFLAGS@ -OPTIONAL_LIBCAP_LIBS = @OPTIONAL_LIBCAP_LIBS@ -OPTIONAL_LZ4_LIBS = @OPTIONAL_LZ4_LIBS@ -OPTIONAL_MATH_CFLAGS = @OPTIONAL_MATH_CFLAGS@ -OPTIONAL_MATH_LIBS = @OPTIONAL_MATH_LIBS@ -OPTIONAL_MONGOC_CFLAGS = @OPTIONAL_MONGOC_CFLAGS@ -OPTIONAL_MONGOC_LIBS = @OPTIONAL_MONGOC_LIBS@ -OPTIONAL_NFACCT_CFLAGS = @OPTIONAL_NFACCT_CFLAGS@ -OPTIONAL_NFACCT_LIBS = @OPTIONAL_NFACCT_LIBS@ -OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS = @OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS@ -OPTIONAL_PROMETHEUS_REMOTE_WRITE_LIBS = @OPTIONAL_PROMETHEUS_REMOTE_WRITE_LIBS@ -OPTIONAL_SSL_LIBS = @OPTIONAL_SSL_LIBS@ -OPTIONAL_UUID_CFLAGS = @OPTIONAL_UUID_CFLAGS@ -OPTIONAL_UUID_LIBS = @OPTIONAL_UUID_LIBS@ -OPTIONAL_UV_LIBS = @OPTIONAL_UV_LIBS@ -OPTIONAL_XENSTAT_CFLAGS = @OPTIONAL_XENSTAT_CFLAGS@ -OPTIONAL_XENSTAT_LIBS = @OPTIONAL_XENSTAT_LIBS@ -OPTIONAL_ZLIB_CFLAGS = @OPTIONAL_ZLIB_CFLAGS@ -OPTIONAL_ZLIB_LIBS = @OPTIONAL_ZLIB_LIBS@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_RPM_VERSION = @PACKAGE_RPM_VERSION@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_URL = @PACKAGE_URL@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PKG_CONFIG = @PKG_CONFIG@ -PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ -PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ -PROTOBUF_CFLAGS = @PROTOBUF_CFLAGS@ -PROTOBUF_LIBS = @PROTOBUF_LIBS@ -PROTOC = @PROTOC@ -PTHREAD_CC = @PTHREAD_CC@ -PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ -PTHREAD_LIBS = @PTHREAD_LIBS@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -SSE_CANDIDATE = @SSE_CANDIDATE@ -STRIP = @STRIP@ -TEST_CFLAGS = @TEST_CFLAGS@ -TEST_LIBS = @TEST_LIBS@ -UUID_CFLAGS = @UUID_CFLAGS@ -UUID_LIBS = @UUID_LIBS@ -VERSION = @VERSION@ -XENLIGHT_CFLAGS = @XENLIGHT_CFLAGS@ -XENLIGHT_LIBS = @XENLIGHT_LIBS@ -YAJL_CFLAGS = @YAJL_CFLAGS@ -YAJL_LIBS = @YAJL_LIBS@ -ZLIB_CFLAGS = @ZLIB_CFLAGS@ -ZLIB_LIBS = @ZLIB_LIBS@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -ax_pthread_config = @ax_pthread_config@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_target = @build_target@ -build_vendor = @build_vendor@ -builddir = @builddir@ -cachedir = @cachedir@ -chartsdir = @chartsdir@ -configdir = @configdir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -has_jemalloc = @has_jemalloc@ -has_tcmalloc = @has_tcmalloc@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libconfigdir = @libconfigdir@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -logdir = @logdir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -nodedir = @nodedir@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -pluginsdir = @pluginsdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -pythondir = @pythondir@ -registrydir = @registrydir@ -runstatedir = @runstatedir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target_alias = @target_alias@ -top_build_prefix = @top_build_prefix@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ -varlibdir = @varlibdir@ -webdir = @webdir@ -AUTOMAKE_OPTIONS = subdir-objects -MAINTAINERCLEANFILES = $(srcdir)/Makefile.in -CLEANFILES = \ - alarm-notify.sh \ - $(NULL) - -SUFFIXES = .in -dist_libconfig_DATA = \ - health_alarm_notify.conf \ - health_email_recipients.conf \ - $(NULL) - -dist_plugins_SCRIPTS = \ - alarm-notify.sh \ - alarm-email.sh \ - alarm-test.sh \ - $(NULL) - - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files - -# install these files -dist_noinst_DATA = alarm-notify.sh.in README.md $(NULL) \ - alerta/README.md alerta/Makefile.inc $(NULL) awssns/README.md \ - awssns/Makefile.inc $(NULL) discord/README.md \ - discord/Makefile.inc $(NULL) email/README.md \ - email/Makefile.inc $(NULL) flock/README.md flock/Makefile.inc \ - $(NULL) hangouts/README.md hangouts/Makefile.inc $(NULL) \ - irc/README.md irc/Makefile.inc $(NULL) kavenegar/README.md \ - kavenegar/Makefile.inc $(NULL) messagebird/README.md \ - messagebird/Makefile.inc $(NULL) pagerduty/README.md \ - pagerduty/Makefile.inc $(NULL) pushbullet/README.md \ - pushbullet/Makefile.inc $(NULL) pushover/README.md \ - pushover/Makefile.inc $(NULL) rocketchat/README.md \ - rocketchat/Makefile.inc $(NULL) slack/README.md \ - slack/Makefile.inc $(NULL) smstools3/README.md \ - smstools3/Makefile.inc $(NULL) syslog/README.md \ - syslog/Makefile.inc $(NULL) telegram/README.md \ - telegram/Makefile.inc $(NULL) twilio/README.md \ - twilio/Makefile.inc $(NULL) web/README.md web/Makefile.inc \ - $(NULL) custom/README.md custom/Makefile.inc $(NULL) -all: all-am - -.SUFFIXES: -.SUFFIXES: .in -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/build/subst.inc $(srcdir)/alerta/Makefile.inc $(srcdir)/awssns/Makefile.inc $(srcdir)/discord/Makefile.inc $(srcdir)/email/Makefile.inc $(srcdir)/flock/Makefile.inc $(srcdir)/hangouts/Makefile.inc $(srcdir)/irc/Makefile.inc $(srcdir)/kavenegar/Makefile.inc $(srcdir)/messagebird/Makefile.inc $(srcdir)/pagerduty/Makefile.inc $(srcdir)/pushbullet/Makefile.inc $(srcdir)/pushover/Makefile.inc $(srcdir)/rocketchat/Makefile.inc $(srcdir)/slack/Makefile.inc $(srcdir)/smstools3/Makefile.inc $(srcdir)/syslog/Makefile.inc $(srcdir)/telegram/Makefile.inc $(srcdir)/twilio/Makefile.inc $(srcdir)/web/Makefile.inc $(srcdir)/custom/Makefile.inc $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ - && { if test -f $@; then exit 0; else break; fi; }; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu health/notifications/Makefile'; \ - $(am__cd) $(top_srcdir) && \ - $(AUTOMAKE) --gnu health/notifications/Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; -$(top_srcdir)/build/subst.inc $(srcdir)/alerta/Makefile.inc $(srcdir)/awssns/Makefile.inc $(srcdir)/discord/Makefile.inc $(srcdir)/email/Makefile.inc $(srcdir)/flock/Makefile.inc $(srcdir)/hangouts/Makefile.inc $(srcdir)/irc/Makefile.inc $(srcdir)/kavenegar/Makefile.inc $(srcdir)/messagebird/Makefile.inc $(srcdir)/pagerduty/Makefile.inc $(srcdir)/pushbullet/Makefile.inc $(srcdir)/pushover/Makefile.inc $(srcdir)/rocketchat/Makefile.inc $(srcdir)/slack/Makefile.inc $(srcdir)/smstools3/Makefile.inc $(srcdir)/syslog/Makefile.inc $(srcdir)/telegram/Makefile.inc $(srcdir)/twilio/Makefile.inc $(srcdir)/web/Makefile.inc $(srcdir)/custom/Makefile.inc $(am__empty): - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(am__aclocal_m4_deps): -install-dist_pluginsSCRIPTS: $(dist_plugins_SCRIPTS) - @$(NORMAL_INSTALL) - @list='$(dist_plugins_SCRIPTS)'; test -n "$(pluginsdir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(pluginsdir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(pluginsdir)" || exit 1; \ - fi; \ - for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - if test -f "$$d$$p"; then echo "$$d$$p"; echo "$$p"; else :; fi; \ - done | \ - sed -e 'p;s,.*/,,;n' \ - -e 'h;s|.*|.|' \ - -e 'p;x;s,.*/,,;$(transform)' | sed 'N;N;N;s,\n, ,g' | \ - $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1; } \ - { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ - if ($$2 == $$4) { files[d] = files[d] " " $$1; \ - if (++n[d] == $(am__install_max)) { \ - print "f", d, files[d]; n[d] = 0; files[d] = "" } } \ - else { print "f", d "/" $$4, $$1 } } \ - END { for (d in files) print "f", d, files[d] }' | \ - while read type dir files; do \ - if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ - test -z "$$files" || { \ - echo " $(INSTALL_SCRIPT) $$files '$(DESTDIR)$(pluginsdir)$$dir'"; \ - $(INSTALL_SCRIPT) $$files "$(DESTDIR)$(pluginsdir)$$dir" || exit $$?; \ - } \ - ; done - -uninstall-dist_pluginsSCRIPTS: - @$(NORMAL_UNINSTALL) - @list='$(dist_plugins_SCRIPTS)'; test -n "$(pluginsdir)" || exit 0; \ - files=`for p in $$list; do echo "$$p"; done | \ - sed -e 's,.*/,,;$(transform)'`; \ - dir='$(DESTDIR)$(pluginsdir)'; $(am__uninstall_files_from_dir) -install-dist_libconfigDATA: $(dist_libconfig_DATA) - @$(NORMAL_INSTALL) - @list='$(dist_libconfig_DATA)'; test -n "$(libconfigdir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(libconfigdir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(libconfigdir)" || exit 1; \ - fi; \ - for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; \ - done | $(am__base_list) | \ - while read files; do \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(libconfigdir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(libconfigdir)" || exit $$?; \ - done - -uninstall-dist_libconfigDATA: - @$(NORMAL_UNINSTALL) - @list='$(dist_libconfig_DATA)'; test -n "$(libconfigdir)" || list=; \ - files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ - dir='$(DESTDIR)$(libconfigdir)'; $(am__uninstall_files_from_dir) -tags TAGS: - -ctags CTAGS: - -cscope cscopelist: - - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d "$(distdir)/$$file"; then \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ - else \ - test -f "$(distdir)/$$file" \ - || cp -p $$d/$$file "$(distdir)/$$file" \ - || exit 1; \ - fi; \ - done -check-am: all-am -check: check-am -all-am: Makefile $(SCRIPTS) $(DATA) -installdirs: - for dir in "$(DESTDIR)$(pluginsdir)" "$(DESTDIR)$(libconfigdir)"; do \ - test -z "$$dir" || $(MKDIR_P) "$$dir"; \ - done -install: install-am -install-exec: install-exec-am -install-data: install-data-am -uninstall: uninstall-am - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-am -install-strip: - if test -z '$(STRIP)'; then \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - install; \ - else \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ - fi -mostlyclean-generic: - -clean-generic: - -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES) - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." - -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES) -clean: clean-am - -clean-am: clean-generic mostlyclean-am - -distclean: distclean-am - -rm -f Makefile -distclean-am: clean-am distclean-generic - -dvi: dvi-am - -dvi-am: - -html: html-am - -html-am: - -info: info-am - -info-am: - -install-data-am: install-dist_libconfigDATA \ - install-dist_pluginsSCRIPTS - -install-dvi: install-dvi-am - -install-dvi-am: - -install-exec-am: - -install-html: install-html-am - -install-html-am: - -install-info: install-info-am - -install-info-am: - -install-man: - -install-pdf: install-pdf-am - -install-pdf-am: - -install-ps: install-ps-am - -install-ps-am: - -installcheck-am: - -maintainer-clean: maintainer-clean-am - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-am - -mostlyclean-am: mostlyclean-generic - -pdf: pdf-am - -pdf-am: - -ps: ps-am - -ps-am: - -uninstall-am: uninstall-dist_libconfigDATA \ - uninstall-dist_pluginsSCRIPTS - -.MAKE: install-am install-strip - -.PHONY: all all-am check check-am clean clean-generic cscopelist-am \ - ctags-am distclean distclean-generic distdir dvi dvi-am html \ - html-am info info-am install install-am install-data \ - install-data-am install-dist_libconfigDATA \ - install-dist_pluginsSCRIPTS install-dvi install-dvi-am \ - install-exec install-exec-am install-html install-html-am \ - install-info install-info-am install-man install-pdf \ - install-pdf-am install-ps install-ps-am install-strip \ - installcheck installcheck-am installdirs maintainer-clean \ - maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ - pdf-am ps ps-am tags-am uninstall uninstall-am \ - uninstall-dist_libconfigDATA uninstall-dist_pluginsSCRIPTS - -.PRECIOUS: Makefile - -.in: - if sed \ - -e 's#[@]localstatedir_POST@#$(localstatedir)#g' \ - -e 's#[@]sbindir_POST@#$(sbindir)#g' \ - -e 's#[@]pluginsdir_POST@#$(pluginsdir)#g' \ - -e 's#[@]configdir_POST@#$(configdir)#g' \ - -e 's#[@]libconfigdir_POST@#$(libconfigdir)#g' \ - -e 's#[@]cachedir_POST@#$(cachedir)#g' \ - -e 's#[@]registrydir_POST@#$(registrydir)#g' \ - -e 's#[@]varlibdir_POST@#$(varlibdir)#g' \ - $< > $@.tmp; then \ - mv "$@.tmp" "$@"; \ - else \ - rm -f "$@.tmp"; \ - false; \ - fi - -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff --git a/health/notifications/README.md b/health/notifications/README.md index c086e79b1..53541981d 100644 --- a/health/notifications/README.md +++ b/health/notifications/README.md @@ -1,7 +1,13 @@ -# Netdata alarm notifications + + +# Alarm notifications The `exec` line in health configuration defines an external script that will be called once -the alarm is triggered. The default script is **[alarm-notify.sh](alarm-notify.sh.in)**. +the alarm is triggered. The default script is `alarm-notify.sh`. You can change the default script globally by editing `/etc/netdata/netdata.conf`. @@ -13,11 +19,10 @@ You can change the default script globally by editing `/etc/netdata/netdata.conf It uses **roles**. For example `sysadmin`, `webmaster`, `dba`, etc. -Each alarm is assigned to one or more roles, using the `to` line of the alarm configuration. -Then `alarm-notify.sh` uses its own configuration file `/etc/netdata/health_alarm_notify.conf` -the default is [here](health_alarm_notify.conf) -(to edit it on your system run `/etc/netdata/edit-config health_alarm_notify.conf`) -to find the destination address of the notification for each method. +Each alarm is assigned to one or more roles, using the `to` line of the alarm configuration. Then `alarm-notify.sh` uses +its own configuration file `/etc/netdata/health_alarm_notify.conf`. To edit it on your system, run +`/etc/netdata/edit-config health_alarm_notify.conf` and find the destination address of the notification for each +method. Each role may have one or more destinations. @@ -31,8 +36,7 @@ So, for example the `sysadmin` role may send: ## Configuration -Edit [`/etc/netdata/health_alarm_notify.conf`](health_alarm_notify.conf) -by running `/etc/netdata/edit-config health_alarm_notify.conf`: +Edit `/etc/netdata/health_alarm_notify.conf` by running `/etc/netdata/edit-config health_alarm_notify.conf`: - settings per notification method: diff --git a/health/notifications/alarm-notify.sh b/health/notifications/alarm-notify.sh deleted file mode 100644 index 5d41ba959..000000000 --- a/health/notifications/alarm-notify.sh +++ /dev/null @@ -1,2428 +0,0 @@ -#!/usr/bin/env bash -#shellcheck source=/dev/null disable=SC2086,SC2154 - -# netdata -# real-time performance and health monitoring, done right! -# (C) 2017 Costa Tsaousis -# SPDX-License-Identifier: GPL-3.0-or-later -# -# Script to send alarm notifications for netdata -# -# Features: -# - multiple notification methods -# - multiple roles per alarm -# - multiple recipients per role -# - severity filtering per recipient -# -# Supported notification methods: -# - emails by @ktsaou -# - slack.com notifications by @ktsaou -# - alerta.io notifications by @kattunga -# - discordapp.com notifications by @lowfive -# - pushover.net notifications by @ktsaou -# - pushbullet.com push notifications by Tiago Peralta @tperalta82 #1070 -# - telegram.org notifications by @hashworks #1002 -# - twilio.com notifications by Levi Blaney @shadycuz #1211 -# - kafka notifications by @ktsaou #1342 -# - pagerduty.com notifications by Jim Cooley @jimcooley #1373 -# - messagebird.com notifications by @tech_no_logical #1453 -# - hipchat notifications by @ktsaou #1561 -# - fleep notifications by @Ferroin -# - prowlapp.com notifications by @Ferroin -# - irc notifications by @manosf -# - custom notifications by @ktsaou -# - syslog messages by @Ferroin -# - Microsoft Team notification by @tioumen -# - RocketChat notifications by @Hermsi1337 #3777 -# - Google Hangouts Chat notifications by @EnzoAkira and @hendrikhofstadt - -# ----------------------------------------------------------------------------- -# testing notifications - -if { [ "${1}" = "test" ] || [ "${2}" = "test" ]; } && [ "${#}" -le 2 ]; then - if [ "${2}" = "test" ]; then - recipient="${1}" - else - recipient="${2}" - fi - - [ -z "${recipient}" ] && recipient="sysadmin" - - id=1 - last="CLEAR" - test_res=0 - for x in "WARNING" "CRITICAL" "CLEAR"; do - echo >&2 - echo >&2 "# SENDING TEST ${x} ALARM TO ROLE: ${recipient}" - - "${0}" "${recipient}" "$(hostname)" 1 1 "${id}" "$(date +%s)" "test_alarm" "test.chart" "test.family" "${x}" "${last}" 100 90 "${0}" 1 $((0 + id)) "units" "this is a test alarm to verify notifications work" "new value" "old value" "evaluated expression" "expression variable values" 0 0 - #shellcheck disable=SC2181 - if [ $? -ne 0 ]; then - echo >&2 "# FAILED" - test_res=1 - else - echo >&2 "# OK" - fi - - last="${x}" - id=$((id + 1)) - done - - exit $test_res -fi - -export PATH="${PATH}:/sbin:/usr/sbin:/usr/local/sbin" -export LC_ALL=C - -# ----------------------------------------------------------------------------- - -PROGRAM_NAME="$(basename "${0}")" - -logdate() { - date "+%Y-%m-%d %H:%M:%S" -} - -log() { - local status="${1}" - shift - - echo >&2 "$(logdate): ${PROGRAM_NAME}: ${status}: ${*}" - -} - -warning() { - log WARNING "${@}" -} - -error() { - log ERROR "${@}" -} - -info() { - log INFO "${@}" -} - -fatal() { - log FATAL "${@}" - exit 1 -} - -debug=${NETDATA_ALARM_NOTIFY_DEBUG-0} -debug() { - [ "${debug}" = "1" ] && log DEBUG "${@}" -} - -docurl() { - if [ -z "${curl}" ]; then - error "${curl} is unset." - return 1 - fi - - if [ "${debug}" = "1" ]; then - echo >&2 "--- BEGIN curl command ---" - printf >&2 "%q " ${curl} "${@}" - echo >&2 - echo >&2 "--- END curl command ---" - - local out code ret - out=$(mktemp /tmp/netdata-health-alarm-notify-XXXXXXXX) - code=$(${curl} ${curl_options} --write-out "%{http_code}" --output "${out}" --silent --show-error "${@}") - ret=$? - echo >&2 "--- BEGIN received response ---" - cat >&2 "${out}" - echo >&2 - echo >&2 "--- END received response ---" - echo >&2 "RECEIVED HTTP RESPONSE CODE: ${code}" - rm "${out}" - echo "${code}" - return ${ret} - fi - - ${curl} ${curl_options} --write-out "%{http_code}" --output /dev/null --silent --show-error "${@}" - return $? -} - -# ----------------------------------------------------------------------------- -# List of all the notification mechanisms we support. -# Used in a couple of places to write more compact code. - -method_names=" -email -pushover -pushbullet -telegram -slack -alerta -flock -discord -hipchat -twilio -messagebird -pd -fleep -syslog -custom -msteam -kavenegar -prowl -irc -awssns -rocketchat -sms -hangouts -" - -# ----------------------------------------------------------------------------- -# this is to be overwritten by the config file - -custom_sender() { - info "not sending custom notification for ${status} of '${host}.${chart}.${name}'" -} - -# ----------------------------------------------------------------------------- - -# check for BASH v4+ (required for associative arrays) -if [ ${BASH_VERSINFO[0]} -lt 4 ]; then - fatal "BASH version 4 or later is required (this is ${BASH_VERSION})." -fi - -# ----------------------------------------------------------------------------- -# defaults to allow running this script by hand - -[ -z "${NETDATA_USER_CONFIG_DIR}" ] && NETDATA_USER_CONFIG_DIR="/etc/netdata" -[ -z "${NETDATA_STOCK_CONFIG_DIR}" ] && NETDATA_STOCK_CONFIG_DIR="/usr/lib/netdata/conf.d" -[ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="/var/cache/netdata" -[ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io" -[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://netdata.cloud" - -# ----------------------------------------------------------------------------- -# parse command line parameters - -if [[ ${1} = "unittest" ]]; then - unittest=1 # enable unit testing mode - roles="${2}" # the role that should be used for unit testing - cfgfile="${3}" # the location of the config file to use for unit testing - status="${4}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL -else - roles="${1}" # the roles that should be notified for this event - args_host="${2}" # the host generated this event - unique_id="${3}" # the unique id of this event - alarm_id="${4}" # the unique id of the alarm that generated this event - event_id="${5}" # the incremental id of the event, for this alarm id - when="${6}" # the timestamp this event occurred - name="${7}" # the name of the alarm, as given in netdata health.d entries - chart="${8}" # the name of the chart (type.id) - family="${9}" # the family of the chart - status="${10}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - old_status="${11}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - value="${12}" # the current value of the alarm - old_value="${13}" # the previous value of the alarm - src="${14}" # the line number and file the alarm has been configured - duration="${15}" # the duration in seconds of the previous alarm state - non_clear_duration="${16}" # the total duration in seconds this is/was non-clear - units="${17}" # the units of the value - info="${18}" # a short description of the alarm - value_string="${19}" # friendly value (with units) - # shellcheck disable=SC2034 - # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947 - old_value_string="${20}" # friendly old value (with units), previously named "old_value_string" - calc_expression="${21}" # contains the expression that was evaluated to trigger the alarm - calc_param_values="${22}" # the values of the parameters in the expression, at the time of the evaluation - total_warnings="${23}" # Total number of alarms in WARNING state - total_critical="${24}" # Total number of alarms in CRITICAL state -fi - -# ----------------------------------------------------------------------------- -# find a suitable hostname to use, if netdata did not supply a hostname - -if [ -z ${args_host} ]; then - this_host=$(hostname -s 2>/dev/null) - host="${this_host}" - args_host="${this_host}" -else - host="${args_host}" -fi - -# ----------------------------------------------------------------------------- -# screen statuses we don't need to send a notification - -# don't do anything if this is not WARNING, CRITICAL or CLEAR -if [ "${status}" != "WARNING" ] && [ "${status}" != "CRITICAL" ] && [ "${status}" != "CLEAR" ]; then - info "not sending notification for ${status} of '${host}.${chart}.${name}'" - exit 1 -fi - -# don't do anything if this is CLEAR, but it was not WARNING or CRITICAL -if [ "${clear_alarm_always}" != "YES" ] && [ "${old_status}" != "WARNING" ] && [ "${old_status}" != "CRITICAL" ] && [ "${status}" = "CLEAR" ]; then - info "not sending notification for ${status} of '${host}.${chart}.${name}' (last status was ${old_status})" - exit 1 -fi - -# ----------------------------------------------------------------------------- -# load configuration - -# By default fetch images from the global public registry. -# This is required by default, since all notification methods need to download -# images via the Internet, and private registries might not be reachable. -# This can be overwritten at the configuration file. -images_base_url="https://registry.my-netdata.io" - -# curl options to use -curl_options="" - -# hostname handling -use_fqdn="NO" - -# needed commands -# if empty they will be searched in the system path -curl= -sendmail= - -# enable / disable features -for method_name in ${method_names^^}; do - declare SEND_${method_name}="YES" - declare DEFAULT_RECIPIENT_${method_name} -done - -for method_name in ${method_names}; do - declare -A role_recipients_${method_name} -done - -# slack configs -SLACK_WEBHOOK_URL= - -# Microsoft Team configs -MSTEAM_WEBHOOK_URL= - -# rocketchat configs -ROCKETCHAT_WEBHOOK_URL= - -# alerta configs -ALERTA_WEBHOOK_URL= -ALERTA_API_KEY= - -# flock configs -FLOCK_WEBHOOK_URL= - -# discord configs -DISCORD_WEBHOOK_URL= - -# pushover configs -PUSHOVER_APP_TOKEN= - -# pushbullet configs -PUSHBULLET_ACCESS_TOKEN= -PUSHBULLET_SOURCE_DEVICE= - -# twilio configs -TWILIO_ACCOUNT_SID= -TWILIO_ACCOUNT_TOKEN= -TWILIO_NUMBER= - -# hipchat configs -HIPCHAT_SERVER= -HIPCHAT_AUTH_TOKEN= - -# messagebird configs -MESSAGEBIRD_ACCESS_KEY= -MESSAGEBIRD_NUMBER= - -# kavenegar configs -KAVENEGAR_API_KEY= -KAVENEGAR_SENDER= - -# telegram configs -TELEGRAM_BOT_TOKEN= - -# kafka configs -SEND_KAFKA="YES" -KAFKA_URL= -KAFKA_SENDER_IP= - -# pagerduty.com configs -PD_SERVICE_KEY= - -# fleep.io configs -FLEEP_SENDER="${host}" - -# Amazon SNS configs -AWSSNS_MESSAGE_FORMAT= - -# syslog configs -SYSLOG_FACILITY= - -# email configs -EMAIL_SENDER= -EMAIL_CHARSET=$(locale charmap 2>/dev/null) -EMAIL_THREADING= -EMAIL_PLAINTEXT_ONLY= - -# irc configs -IRC_NICKNAME= -IRC_REALNAME= -IRC_NETWORK= - -# hangouts configs -declare -A HANGOUTS_WEBHOOK_URI - -# load the stock and user configuration files -# these will overwrite the variables above - -if [ ${unittest} ]; then - if source "${cfgfile}"; then - error "Failed to load requested config file." - exit 1 - fi -else - for CONFIG in "${NETDATA_STOCK_CONFIG_DIR}/health_alarm_notify.conf" "${NETDATA_USER_CONFIG_DIR}/health_alarm_notify.conf"; do - if [ -f "${CONFIG}" ]; then - debug "Loading config file '${CONFIG}'..." - source "${CONFIG}" || error "Failed to load config file '${CONFIG}'." - else - warning "Cannot find file '${CONFIG}'." - fi - done -fi - -# If we didn't autodetect the character set for e-mail and it wasn't -# set by the user, we need to set it to a reasonable default. UTF-8 -# should be correct for almost all modern UNIX systems. -if [ -z ${EMAIL_CHARSET} ]; then - EMAIL_CHARSET="UTF-8" -fi - -# If we've been asked to use FQDN's for the URL's in the alarm, do so, -# unless we're sending an alarm for a slave system which we can't get the -# FQDN of easily. -if [ "${use_fqdn}" = "YES" ] && [ "${host}" = "$(hostname -s 2>/dev/null)" ]; then - host="$(hostname -f 2>/dev/null)" -fi - -# ----------------------------------------------------------------------------- -# filter a recipient based on alarm event severity - -filter_recipient_by_criticality() { - local method="${1}" x="${2}" r s - shift - - r="${x/|*/}" # the recipient - s="${x/*|/}" # the severity required for notifying this recipient - - # no severity filtering for this person - [ "${r}" = "${s}" ] && return 0 - - # the severity is invalid - s="${s^^}" - if [ "${s}" != "CRITICAL" ]; then - error "SEVERITY FILTERING for ${x} VIA ${method}: invalid severity '${s,,}', only 'critical' is supported." - return 0 - fi - - # create the status tracking directory for this user - [ ! -d "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" ] && - mkdir -p "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" - - case "${status}" in - CRITICAL) - # make sure he will get future notifications for this alarm too - touch "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" - debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: the alarm is CRITICAL (will now receive next status change)" - return 0 - ;; - - WARNING) - if [ -f "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" ]; then - # we do not remove the file, so that he will get future notifications of this alarm - debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: recipient has been notified for this alarm in the past (will still receive next status change)" - return 0 - fi - ;; - - *) - if [ -f "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" ]; then - # remove the file, so that he will only receive notifications for CRITICAL states for this alarm - rm "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" - debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: recipient has been notified for this alarm (will only receive CRITICAL notifications from now on)" - return 0 - fi - ;; - esac - - debug "SEVERITY FILTERING for ${x} VIA ${method}: BLOCK: recipient should not receive this notification" - return 1 -} - -# ----------------------------------------------------------------------------- -# verify the delivery methods supported - -# check slack -[ -z "${SLACK_WEBHOOK_URL}" ] && SEND_SLACK="NO" - -# check rocketchat -[ -z "${ROCKETCHAT_WEBHOOK_URL}" ] && SEND_ROCKETCHAT="NO" - -# check alerta -[ -z "${ALERTA_WEBHOOK_URL}" ] && SEND_ALERTA="NO" - -# check flock -[ -z "${FLOCK_WEBHOOK_URL}" ] && SEND_FLOCK="NO" - -# check discord -[ -z "${DISCORD_WEBHOOK_URL}" ] && SEND_DISCORD="NO" - -# check pushover -[ -z "${PUSHOVER_APP_TOKEN}" ] && SEND_PUSHOVER="NO" - -# check pushbullet -[ -z "${PUSHBULLET_ACCESS_TOKEN}" ] && SEND_PUSHBULLET="NO" - -# check twilio -{ [ -z "${TWILIO_ACCOUNT_TOKEN}" ] || [ -z "${TWILIO_ACCOUNT_SID}" ] || [ -z "${TWILIO_NUMBER}" ]; } && SEND_TWILIO="NO" - -# check hipchat -[ -z "${HIPCHAT_AUTH_TOKEN}" ] && SEND_HIPCHAT="NO" - -# check messagebird -{ [ -z "${MESSAGEBIRD_ACCESS_KEY}" ] || [ -z "${MESSAGEBIRD_NUMBER}" ]; } && SEND_MESSAGEBIRD="NO" - -# check kavenegar -{ [ -z "${KAVENEGAR_API_KEY}" ] || [ -z "${KAVENEGAR_SENDER}" ]; } && SEND_KAVENEGAR="NO" - -# check telegram -[ -z "${TELEGRAM_BOT_TOKEN}" ] && SEND_TELEGRAM="NO" - -# check kafka -{ [ -z "${KAFKA_URL}" ] || [ -z "${KAFKA_SENDER_IP}" ]; } && SEND_KAFKA="NO" - -# check irc -[ -z "${IRC_NETWORK}" ] && SEND_IRC="NO" - -# check hangouts -[ ${#HANGOUTS_WEBHOOK_URI[@]} -eq 0 ] && SEND_HANGOUTS="NO" - -# check fleep -#shellcheck disable=SC2153 -{ [ -z "${FLEEP_SERVER}" ] || [ -z "${FLEEP_SENDER}" ]; } && SEND_FLEEP="NO" - -if [ "${SEND_PUSHOVER}" = "YES" ] || - [ "${SEND_SLACK}" = "YES" ] || - [ "${SEND_ROCKETCHAT}" = "YES" ] || - [ "${SEND_ALERTA}" = "YES" ] || - [ "${SEND_PD}" = "YES" ] || - [ "${SEND_FLOCK}" = "YES" ] || - [ "${SEND_DISCORD}" = "YES" ] || - [ "${SEND_HIPCHAT}" = "YES" ] || - [ "${SEND_TWILIO}" = "YES" ] || - [ "${SEND_MESSAGEBIRD}" = "YES" ] || - [ "${SEND_KAVENEGAR}" = "YES" ] || - [ "${SEND_TELEGRAM}" = "YES" ] || - [ "${SEND_PUSHBULLET}" = "YES" ] || - [ "${SEND_KAFKA}" = "YES" ] || - [ "${SEND_FLEEP}" = "YES" ] || - [ "${SEND_PROWL}" = "YES" ] || - [ "${SEND_HANGOUTS}" = "YES" ] || - [ "${SEND_CUSTOM}" = "YES" ] || - [ "${SEND_MSTEAM}" = "YES" ]; then - # if we need curl, check for the curl command - if [ -z "${curl}" ]; then - curl="$(command -v curl 2>/dev/null)" - fi - if [ -z "${curl}" ]; then - error "Cannot find curl command in the system path. Disabling all curl based notifications." - SEND_PUSHOVER="NO" - SEND_PUSHBULLET="NO" - SEND_TELEGRAM="NO" - SEND_SLACK="NO" - SEND_MSTEAM="NO" - SEND_ROCKETCHAT="NO" - SEND_ALERTA="NO" - SEND_PD="NO" - SEND_FLOCK="NO" - SEND_DISCORD="NO" - SEND_TWILIO="NO" - SEND_HIPCHAT="NO" - SEND_MESSAGEBIRD="NO" - SEND_KAVENEGAR="NO" - SEND_KAFKA="NO" - SEND_FLEEP="NO" - SEND_PROWL="NO" - SEND_HANGOUTS="NO" - SEND_CUSTOM="NO" - fi -fi - -if [ "${SEND_SMS}" = "YES" ]; then - if [ -z "${sendsms}" ]; then - sendsms="$(command -v sendsms 2>/dev/null)" - fi - if [ -z "${sendsms}" ]; then - SEND_SMS="NO" - fi -fi -# if we need sendmail, check for the sendmail command -if [ "${SEND_EMAIL}" = "YES" ] && [ -z "${sendmail}" ]; then - sendmail="$(command -v sendmail 2>/dev/null)" - if [ -z "${sendmail}" ]; then - debug "Cannot find sendmail command in the system path. Disabling email notifications." - SEND_EMAIL="NO" - fi -fi - -# if we need logger, check for the logger command -if [ "${SEND_SYSLOG}" = "YES" ] && [ -z "${logger}" ]; then - logger="$(command -v logger 2>/dev/null)" - if [ -z "${logger}" ]; then - debug "Cannot find logger command in the system path. Disabling syslog notifications." - SEND_SYSLOG="NO" - fi -fi - -# if we need aws, check for the aws command -if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then - aws="$(command -v aws 2>/dev/null)" - if [ -z "${aws}" ]; then - debug "Cannot find aws command in the system path. Disabling Amazon SNS notifications." - SEND_AWSSNS="NO" - fi -fi - -# ----------------------------------------------------------------------------- -# find the recipients' addresses per method - -# netdata may call us with multiple roles, and roles may have multiple but -# overlapping recipients - so, here we find the unique recipients. -for method_name in ${method_names}; do - send_var="SEND_${method_name^^}" - if [ "${!send_var}" = "NO" ]; then - continue - fi - - declare -A arr_var=() - - for x in ${roles//,/ }; do - # the roles 'silent' and 'disabled' mean: - # don't send a notification for this role - if [ "${x}" = "silent" ] || [ "${x}" = "disabled" ]; then - continue - fi - - role_recipients="role_recipients_${method_name}[$x]" - default_recipient_var="DEFAULT_RECIPIENT_${method_name^^}" - - a="${!role_recipients}" - [ -z "${a}" ] && a="${!default_recipient_var}" - for r in ${a//,/ }; do - [ "${r}" != "disabled" ] && filter_recipient_by_criticality ${method_name} "${r}" && arr_var[${r/|*/}]="1" - done - done - - # build the list of recipients - to_var="to_${method_name}" - declare to_${method_name}="${!arr_var[*]}" - - [ -z "${!to_var}" ] && declare ${send_var}="NO" -done - -# ----------------------------------------------------------------------------- -# handle fixup of the email recipient list. - -fix_to_email() { - to_email= - while [ -n "${1}" ]; do - [ -n "${to_email}" ] && to_email="${to_email}, " - to_email="${to_email}${1}" - shift 1 - done -} - -# ${to_email} without quotes here -fix_to_email ${to_email} - -# ----------------------------------------------------------------------------- -# handle output if we're running in unit test mode -if [ ${unittest} ]; then - for method_name in ${method_names}; do - to_var="to_${method_name}" - echo "results: ${method_name}: ${!to_var}" - done - exit 0 -fi - -# ----------------------------------------------------------------------------- -# check that we have at least a method enabled -proceed=0 -for method in "${SEND_EMAIL}" \ - "${SEND_PUSHOVER}" \ - "${SEND_TELEGRAM}" \ - "${SEND_SLACK}" \ - "${SEND_ROCKETCHAT}" \ - "${SEND_ALERTA}" \ - "${SEND_FLOCK}" \ - "${SEND_DISCORD}" \ - "${SEND_TWILIO}" \ - "${SEND_HIPCHAT}" \ - "${SEND_MESSAGEBIRD}" \ - "${SEND_KAVENEGAR}" \ - "${SEND_PUSHBULLET}" \ - "${SEND_KAFKA}" \ - "${SEND_PD}" \ - "${SEND_FLEEP}" \ - "${SEND_PROWL}" \ - "${SEND_CUSTOM}" \ - "${SEND_IRC}" \ - "${SEND_HANGOUTS}" \ - "${SEND_AWSSNS}" \ - "${SEND_SYSLOG}" \ - "${SEND_SMS}" \ - "${SEND_MSTEAM}"; do - if [ "${method}" == "YES" ]; then - proceed=1 - break - fi -done -if [ "$proceed" -eq 0 ]; then - fatal "All notification methods are disabled. Not sending notification for host '${host}', chart '${chart}' to '${roles}' for '${name}' = '${value}' for status '${status}'." -fi - -# ----------------------------------------------------------------------------- -# get the date the alarm happened - -date=$(date --date=@${when} "${date_format}" 2>/dev/null) -[ -z "${date}" ] && date=$(date "${date_format}" 2>/dev/null) -[ -z "${date}" ] && date=$(date --date=@${when} 2>/dev/null) -[ -z "${date}" ] && date=$(date 2>/dev/null) - -# ---------------------------------------------------------------------------- -# prepare some extra headers if we've been asked to thread e-mails -if [ "${SEND_EMAIL}" == "YES" ] && [ "${EMAIL_THREADING}" != "NO" ]; then - email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\r\\nReferences: <${chart}-${name}@${host}>" -else - email_thread_headers= -fi - -# ----------------------------------------------------------------------------- -# function to URL encode a string - -urlencode() { - local string="${1}" strlen encoded pos c o - - strlen=${#string} - for ((pos = 0; pos < strlen; pos++)); do - c=${string:pos:1} - case "${c}" in - [-_.~a-zA-Z0-9]) - o="${c}" - ;; - - *) - printf -v o '%%%02x' "'${c}" - ;; - esac - encoded+="${o}" - done - - REPLY="${encoded}" - echo "${REPLY}" -} - -# ----------------------------------------------------------------------------- -# function to convert a duration in seconds, to a human readable duration -# using DAYS, MINUTES, SECONDS - -duration4human() { - local s="${1}" d=0 h=0 m=0 ds="day" hs="hour" ms="minute" ss="second" ret - d=$((s / 86400)) - s=$((s - (d * 86400))) - h=$((s / 3600)) - s=$((s - (h * 3600))) - m=$((s / 60)) - s=$((s - (m * 60))) - - if [ ${d} -gt 0 ]; then - [ ${m} -ge 30 ] && h=$((h + 1)) - [ ${d} -gt 1 ] && ds="days" - [ ${h} -gt 1 ] && hs="hours" - if [ ${h} -gt 0 ]; then - ret="${d} ${ds} and ${h} ${hs}" - else - ret="${d} ${ds}" - fi - elif [ ${h} -gt 0 ]; then - [ ${s} -ge 30 ] && m=$((m + 1)) - [ ${h} -gt 1 ] && hs="hours" - [ ${m} -gt 1 ] && ms="minutes" - if [ ${m} -gt 0 ]; then - ret="${h} ${hs} and ${m} ${ms}" - else - ret="${h} ${hs}" - fi - elif [ ${m} -gt 0 ]; then - [ ${m} -gt 1 ] && ms="minutes" - [ ${s} -gt 1 ] && ss="seconds" - if [ ${s} -gt 0 ]; then - ret="${m} ${ms} and ${s} ${ss}" - else - ret="${m} ${ms}" - fi - else - [ ${s} -gt 1 ] && ss="seconds" - ret="${s} ${ss}" - fi - - REPLY="${ret}" - echo "${REPLY}" -} - -# ----------------------------------------------------------------------------- -# email sender - -send_email() { - local ret opts=() sender_email="${EMAIL_SENDER}" sender_name= - if [ "${SEND_EMAIL}" = "YES" ]; then - - if [ -n "${EMAIL_SENDER}" ]; then - if [[ ${EMAIL_SENDER} =~ ^\".*\"\ \<.*\>$ ]]; then - # the name includes double quotes - sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" - sender_name="$(echo "${EMAIL_SENDER}" | cut -d '"' -f 2)" - elif [[ ${EMAIL_SENDER} =~ ^\'.*\'\ \<.*\>$ ]]; then - # the name includes single quotes - sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" - sender_name="$(echo "${EMAIL_SENDER}" | cut -d "'" -f 2)" - elif [[ ${EMAIL_SENDER} =~ ^.*\ \<.*\>$ ]]; then - # the name does not have any quotes - sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" - sender_name="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 1)" - fi - fi - - [ -n "${sender_email}" ] && opts+=(-f "${sender_email}") - [ -n "${sender_name}" ] && opts+=(-F "${sender_name}") - - if [ "${debug}" = "1" ]; then - echo >&2 "--- BEGIN sendmail command ---" - printf >&2 "%q " "${sendmail}" -t "${opts[@]}" - echo >&2 - echo >&2 "--- END sendmail command ---" - fi - - "${sendmail}" -t "${opts[@]}" - ret=$? - - if [ ${ret} -eq 0 ]; then - info "sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'" - return 0 - else - error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}." - return 1 - fi - fi - - return 1 -} - -# ----------------------------------------------------------------------------- -# pushover sender - -send_pushover() { - local apptoken="${1}" usertokens="${2}" when="${3}" url="${4}" status="${5}" title="${6}" message="${7}" httpcode sent=0 user priority - - if [ "${SEND_PUSHOVER}" = "YES" ] && [ -n "${apptoken}" ] && [ -n "${usertokens}" ] && [ -n "${title}" ] && [ -n "${message}" ]; then - - # https://pushover.net/api - priority=-2 - case "${status}" in - CLEAR) priority=-1 ;; # low priority: no sound or vibration - WARNING) priority=0 ;; # normal priority: respect quiet hours - CRITICAL) priority=1 ;; # high priority: bypass quiet hours - *) priority=-2 ;; # lowest priority: no notification at all - esac - - for user in ${usertokens}; do - httpcode=$(docurl \ - --form-string "token=${apptoken}" \ - --form-string "user=${user}" \ - --form-string "html=1" \ - --form-string "title=${title}" \ - --form-string "message=${message}" \ - --form-string "timestamp=${when}" \ - --form-string "url=${url}" \ - --form-string "url_title=Open netdata dashboard to view the alarm" \ - --form-string "priority=${priority}" \ - https://api.pushover.net/1/messages.json) - - if [ "${httpcode}" = "200" ]; then - info "sent pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}'" - sent=$((sent + 1)) - else - error "failed to send pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 -} - -# ----------------------------------------------------------------------------- -# pushbullet sender - -send_pushbullet() { - local userapikey="${1}" source_device="${2}" recipients="${3}" url="${4}" title="${5}" message="${6}" httpcode sent=0 user - if [ "${SEND_PUSHBULLET}" = "YES" ] && [ -n "${userapikey}" ] && [ -n "${recipients}" ] && [ -n "${message}" ] && [ -n "${title}" ]; then - #https://docs.pushbullet.com/#create-push - for user in ${recipients}; do - httpcode=$(docurl \ - --header 'Access-Token: '${userapikey}'' \ - --header 'Content-Type: application/json' \ - --data-binary @<( - cat < from the message - message="${message///}" - message="${message//<\/small>/}" - - if [ "${SEND_HIPCHAT}" = "YES" ] && [ -n "${HIPCHAT_SERVER}" ] && [ -n "${authtoken}" ] && [ -n "${recipients}" ] && [ -n "${message}" ]; then - # Valid values: html, text. - # Defaults to 'html'. - msg_format="html" - - # Background color for message. Valid values: yellow, green, red, purple, gray, random. Defaults to 'yellow'. - case "${status}" in - WARNING) color="yellow" ;; - CRITICAL) color="red" ;; - CLEAR) color="green" ;; - *) color="gray" ;; - esac - - # Whether this message should trigger a user notification (change the tab color, play a sound, notify mobile phones, etc). - # Each recipient's notification preferences are taken into account. - # Defaults to false. - notify="true" - - for room in ${recipients}; do - httpcode=$(docurl -X POST \ - -H "Content-type: application/json" \ - -H "Authorization: Bearer ${authtoken}" \ - -d "{\"color\": \"${color}\", \"from\": \"${host}\", \"message_format\": \"${msg_format}\", \"message\": \"${message}\", \"notify\": \"${notify}\"}" \ - "https://${HIPCHAT_SERVER}/v2/room/${room}/notification") - - if [ "${httpcode}" = "204" ]; then - info "sent HipChat notification for: ${host} ${chart}.${name} is ${status} to '${room}'" - sent=$((sent + 1)) - else - error "failed to send HipChat notification for: ${host} ${chart}.${name} is ${status} to '${room}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 -} - -# ----------------------------------------------------------------------------- -# messagebird sender - -send_messagebird() { - local accesskey="${1}" messagebirdnumber="${2}" recipients="${3}" title="${4}" message="${5}" httpcode sent=0 user - if [ "${SEND_MESSAGEBIRD}" = "YES" ] && [ -n "${accesskey}" ] && [ -n "${messagebirdnumber}" ] && [ -n "${recipients}" ] && [ -n "${message}" ] && [ -n "${title}" ]; then - #https://developers.messagebird.com/docs/messaging - for user in ${recipients}; do - httpcode=$(docurl -X POST \ - --data-urlencode "originator=${messagebirdnumber}" \ - --data-urlencode "recipients=${user}" \ - --data-urlencode "body=${title} ${message}" \ - --data-urlencode "datacoding=auto" \ - -H "Authorization: AccessKey ${accesskey}" \ - "https://rest.messagebird.com/messages") - - if [ "${httpcode}" = "201" ]; then - info "sent Messagebird SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" - sent=$((sent + 1)) - else - error "failed to send Messagebird SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 -} - -# ----------------------------------------------------------------------------- -# kavenegar sender - -send_kavenegar() { - local API_KEY="${1}" kavenegarsender="${2}" recipients="${3}" title="${4}" message="${5}" httpcode sent=0 user - if [ "${SEND_KAVENEGAR}" = "YES" ] && [ -n "${API_KEY}" ] && [ -n "${kavenegarsender}" ] && [ -n "${recipients}" ] && [ -n "${message}" ] && [ -n "${title}" ]; then - # http://api.kavenegar.com/v1/{API-KEY}/sms/send.json - for user in ${recipients}; do - httpcode=$(docurl -X POST http://api.kavenegar.com/v1/${API_KEY}/sms/send.json \ - --data-urlencode "sender=${kavenegarsender}" \ - --data-urlencode "receptor=${user}" \ - --data-urlencode "message=${title} ${message}") - - if [ "${httpcode}" = "200" ]; then - info "sent Kavenegar SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" - sent=$((sent + 1)) - else - error "failed to send Kavenegar SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 -} - -# ----------------------------------------------------------------------------- -# telegram sender - -send_telegram() { - local bottoken="${1}" chatids="${2}" message="${3}" httpcode sent=0 chatid emoji disableNotification="" - - if [ "${status}" = "CLEAR" ]; then disableNotification="--data-urlencode disable_notification=true"; fi - - case "${status}" in - WARNING) emoji="⚠️" ;; - CRITICAL) emoji="🔴" ;; - CLEAR) emoji="✅" ;; - *) emoji="⚪️" ;; - esac - - if [ "${SEND_TELEGRAM}" = "YES" ] && [ -n "${bottoken}" ] && [ -n "${chatids}" ] && [ -n "${message}" ]; then - for chatid in ${chatids}; do - # https://core.telegram.org/bots/api#sendmessage - httpcode=$(docurl ${disableNotification} \ - --data-urlencode "parse_mode=HTML" \ - --data-urlencode "disable_web_page_preview=true" \ - --data-urlencode "text=${emoji} ${message}" \ - "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatid}") - - if [ "${httpcode}" = "200" ]; then - info "sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'" - sent=$((sent + 1)) - elif [ "${httpcode}" = "401" ]; then - error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token." - else - error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 -} - -# ----------------------------------------------------------------------------- -# Microsoft Team sender - -send_msteam() { - - local webhook="${1}" channels="${2}" httpcode sent=0 channel color payload - - [ "${SEND_MSTEAM}" != "YES" ] && return 1 - - case "${status}" in - WARNING) icon="${MSTEAM_ICON_WARNING}" && color="${MSTEAM_COLOR_WARNING}" ;; - CRITICAL) icon="${MSTEAM_ICON_CRITICAL}" && color="${MSTEAM_COLOR_CRITICAL}" ;; - CLEAR) icon="${MSTEAM_ICON_CLEAR}" && color="${MSTEAM_COLOR_CLEAR}" ;; - *) icon="${MSTEAM_ICON_DEFAULT}" && color="${MSTEAM_COLOR_DEFAULT}" ;; - esac - - for channel in ${channels}; do - ## More details are available here regarding the payload syntax options : https://docs.microsoft.com/en-us/outlook/actionable-messages/message-card-reference - ## Online designer : https://acdesignerbeta.azurewebsites.net/ - payload="$( - cat <View Netdata" - }, - "origin": "netdata/${host}", - "type": "netdataAlarm", - "rawData": "${BASH_ARGV[@]}" - } -EOF - )" - - if [ -n "${ALERTA_API_KEY}" ]; then - auth="Key ${ALERTA_API_KEY}" - fi - - httpcode=$(docurl -X POST "${webhook}/alert" -H "Content-Type: application/json" -H "Authorization: $auth" --data "${payload}") - - if [ "${httpcode}" = "200" ] || [ "${httpcode}" = "201" ]; then - info "sent alerta notification for: ${host} ${chart}.${name} is ${status} to '${channel}'" - sent=$((sent + 1)) - elif [ "${httpcode}" = "202" ]; then - info "suppressed alerta notification for: ${host} ${chart}.${name} is ${status} to '${channel}'" - else - error "failed to send alerta notification for: ${host} ${chart}.${name} is ${status} to '${channel}', with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - - return 1 -} - -# ----------------------------------------------------------------------------- -# flock sender - -send_flock() { - local webhook="${1}" channels="${2}" httpcode sent=0 channel color payload - - [ "${SEND_FLOCK}" != "YES" ] && return 1 - - case "${status}" in - WARNING) color="warning" ;; - CRITICAL) color="danger" ;; - CLEAR) color="good" ;; - *) color="#777777" ;; - esac - - for channel in ${channels}; do - httpcode=$(docurl -X POST "${webhook}" -H "Content-Type: application/json" -d "{ - \"sendAs\": { - \"name\" : \"netdata on ${host}\", - \"profileImage\" : \"${images_base_url}/images/banner-icon-144x144.png\" - }, - \"text\": \"${host} *${status_message}*\", - \"timestamp\": \"${when}\", - \"attachments\": [ - { - \"description\": \"${chart} (${family}) - ${info}\", - \"color\": \"${color}\", - \"title\": \"${alarm}\", - \"url\": \"${goto_url}\", - \"text\": \"${info}\", - \"views\": { - \"image\": { - \"original\": { \"src\": \"${image}\", \"width\": 400, \"height\": 400 }, - \"thumbnail\": { \"src\": \"${image}\", \"width\": 50, \"height\": 50 }, - \"filename\": \"${image}\" - } - } - } - ] - }") - if [ "${httpcode}" = "200" ]; then - info "sent flock notification for: ${host} ${chart}.${name} is ${status} to '${channel}'" - sent=$((sent + 1)) - else - error "failed to send flock notification for: ${host} ${chart}.${name} is ${status} to '${channel}', with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - - return 1 -} - -# ----------------------------------------------------------------------------- -# discord sender - -send_discord() { - local webhook="${1}/slack" channels="${2}" httpcode sent=0 channel color payload username - - [ "${SEND_DISCORD}" != "YES" ] && return 1 - - case "${status}" in - WARNING) color="warning" ;; - CRITICAL) color="danger" ;; - CLEAR) color="good" ;; - *) color="#777777" ;; - esac - - for channel in ${channels}; do - username="netdata on ${host}" - [ ${#username} -gt 32 ] && username="${username:0:29}..." - - payload="$( - cat </dev/null; then - info "sent Amazon SNS notification for: ${host} ${chart}.${name} is ${status} to '${target}'" - sent=$((sent + 1)) - else - error "failed to send Amazon SNS notification for: ${host} ${chart}.${name} is ${status} to '${target}'" - fi - done - - [ ${sent} -gt 0 ] && return 0 - - return 1 -} - -# ----------------------------------------------------------------------------- -# syslog sender - -send_syslog() { - local facility=${SYSLOG_FACILITY:-"local6"} level='info' targets="${1}" - local priority='' message='' host='' port='' prefix='' - local temp1='' temp2='' - - [ "${SEND_SYSLOG}" = "YES" ] || return 1 - - if [ "${status}" = "CRITICAL" ]; then - level='crit' - elif [ "${status}" = "WARNING" ]; then - level='warning' - fi - - for target in ${targets}; do - priority="${facility}.${level}" - message='' - host='' - port='' - prefix='' - temp1='' - temp2='' - - prefix=$(echo ${target} | cut -d '/' -f 2) - temp1=$(echo ${target} | cut -d '/' -f 1) - - if [ ${prefix} != ${temp1} ]; then - if (echo ${temp1} | grep -q '@'); then - temp2=$(echo ${temp1} | cut -d '@' -f 1) - host=$(echo ${temp1} | cut -d '@' -f 2) - - if [ ${temp2} != ${host} ]; then - priority=${temp2} - fi - - port=$(echo ${host} | rev | cut -d ':' -f 1 | rev) - - if (echo ${host} | grep -E -q '\[.*\]'); then - if (echo ${port} | grep -q ']'); then - port='' - else - host=$(echo ${host} | rev | cut -d ':' -f 2- | rev) - fi - else - if [ ${port} = ${host} ]; then - port='' - else - host=$(echo ${host} | cut -d ':' -f 1) - fi - fi - else - priority=${temp1} - fi - fi - - message="${prefix} ${status} on ${host} at ${date}: ${chart} ${value_string}" - - if [ ${host} ]; then - logger_options="${logger_options} -n ${host}" - if [ ${port} ]; then - logger_options="${logger_options} -P ${port}" - fi - fi - - ${logger} -p ${priority} ${logger_options} "${message}" - done - - return $? -} - -# ----------------------------------------------------------------------------- -# SMS sender - -send_sms() { - local recipients="${1}" errcode errmessage sent=0 - - # Human readable SMS - local msg="${host} ${status_message}: ${chart} (${family}), ${alarm}" - - # limit it to 160 characters - msg="${msg:0:160}" - - if [ "${SEND_SMS}" = "YES" ] && [ -n "${sendsms}" ] && [ -n "${recipients}" ] && [ -n "${msg}" ]; then - # http://api.kavenegar.com/v1/{API-KEY}/sms/send.json - for phone in ${recipients}; do - errmessage=$($sendsms $phone "$msg" 2>&1) - errcode=$? - if [ ${errcode} -eq 0 ]; then - info "sent smstools3 SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" - sent=$((sent + 1)) - else - error "failed to send smstools3 SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with error code ${errcode}: ${errmessage}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 -} - -# ----------------------------------------------------------------------------- -# hangouts sender - -send_hangouts() { - local rooms="${1}" httpcode sent=0 room color payload webhook - - [ "${SEND_HANGOUTS}" != "YES" ] && return 1 - - case "${status}" in - WARNING) color="#ffa700" ;; - CRITICAL) color="#d62d20" ;; - CLEAR) color="#008744" ;; - *) color="#777777" ;; - esac - - for room in ${rooms}; do - if [ -z "${HANGOUTS_WEBHOOK_URI[$room]}" ] ; then - info "Can't send Hangouts notification for: ${host} ${chart}.${name} to room ${room}. HANGOUTS_WEBHOOK_URI[$room] not defined" - else - webhook="${HANGOUTS_WEBHOOK_URI[$room]}" - payload="$( - cat <${host}", - "widgets": [ - { - "keyValue": { - "topLabel": "Status Message", - "content": "${status_message}", - "contentMultiline": "true", - "iconUrl": "${image}", - "onClick": { - "openLink": { - "url": "${goto_url}" - } - } - } - }, - { - "keyValue": { - "topLabel": "${chart} | ${family}", - "content": "${alarm}", - "contentMultiline": "true" - } - } - ] - }, - { - "widgets": [ - { - "textParagraph": { - "text": "@ ${date}\n${info}" - } - } - ] - }, - { - "widgets": [ - { - "buttons": [ - { - "textButton": { - "text": "Go to ${host}", - "onClick": { - "openLink": { - "url": "${goto_url}" - } - } - } - } - ] - } - ] - } - ] - } - ] - } -EOF - )" - - httpcode=$(docurl -H "Content-Type: application/json" -X POST -d "${payload}" "${webhook}") - - if [ "${httpcode}" = "200" ]; then - info "sent hangouts notification for: ${host} ${chart}.${name} is ${status} to '${room}'" - sent=$((sent + 1)) - else - error "failed to send hangouts notification for: ${host} ${chart}.${name} is ${status} to '${room}', with HTTP error code ${httpcode}." - fi - fi - done - - [ ${sent} -gt 0 ] && return 0 - - return 1 -} - -# ----------------------------------------------------------------------------- -# prepare the content of the notification - -# the url to send the user on click -urlencode "${args_host}" >/dev/null -url_host="${REPLY}" -urlencode "${chart}" >/dev/null -url_chart="${REPLY}" -urlencode "${family}" >/dev/null -url_family="${REPLY}" -urlencode "${name}" >/dev/null -url_name="${REPLY}" - -redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}" -GOTOCLOUD=0 - -if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then - if [ -z "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then - if [ -f "/var/lib/netdata/registry/netdata.public.unique.id" ]; then - NETDATA_REGISTRY_UNIQUE_ID="$(cat "/var/lib/netdata/registry/netdata.public.unique.id")" - fi - fi - if [ -n "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then - GOTOCLOUD=1 - fi -fi - -if [ ${GOTOCLOUD} -eq 0 ]; then - goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}" -else - goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" -fi - -# the severity of the alarm -severity="${status}" - -# the time the alarm was raised -duration4human ${duration} >/dev/null -duration_txt="${REPLY}" -duration4human ${non_clear_duration} >/dev/null -non_clear_duration_txt="${REPLY}" -raised_for="(was ${old_status,,} for ${duration_txt})" - -# the key status message -status_message="status unknown" - -# the color of the alarm -color="grey" - -# the alarm value -alarm="${name//_/ } = ${value_string}" - -# the image of the alarm -image="${images_base_url}/images/banner-icon-144x144.png" - -# prepare the title based on status -case "${status}" in -CRITICAL) - image="${images_base_url}/images/alert-128-red.png" - status_message="is critical" - color="#ca414b" - ;; - -WARNING) - image="${images_base_url}/images/alert-128-orange.png" - status_message="needs attention" - color="#ffc107" - ;; - -CLEAR) - image="${images_base_url}/images/check-mark-2-128-green.png" - status_message="recovered" - color="#77ca6d" - ;; -esac - -if [ "${status}" = "CLEAR" ]; then - severity="Recovered from ${old_status}" - if [ ${non_clear_duration} -gt ${duration} ]; then - raised_for="(alarm was raised for ${non_clear_duration_txt})" - fi - - # don't show the value when the status is CLEAR - # for certain alarms, this value might not have any meaning - alarm="${name//_/ } ${raised_for}" - -elif { [ "${old_status}" = "WARNING" ] && [ "${status}" = "CRITICAL" ]; }; then - severity="Escalated to ${status}" - if [ ${non_clear_duration} -gt ${duration} ]; then - raised_for="(alarm is raised for ${non_clear_duration_txt})" - fi - -elif { [ "${old_status}" = "CRITICAL" ] && [ "${status}" = "WARNING" ]; }; then - severity="Demoted to ${status}" - if [ ${non_clear_duration} -gt ${duration} ]; then - raised_for="(alarm is raised for ${non_clear_duration_txt})" - fi - -else - raised_for= -fi - -# prepare HTML versions of elements -info_html= -[ -n "${info}" ] && info_html="
${info}
" - -raised_for_html= -[ -n "${raised_for}" ] && raised_for_html="
${raised_for}" - -# ----------------------------------------------------------------------------- -# send the slack notification - -# slack aggregates posts from the same username -# so we use "${host} ${status}" as the bot username, to make them diff - -send_slack "${SLACK_WEBHOOK_URL}" "${to_slack}" -SENT_SLACK=$? - -# ----------------------------------------------------------------------------- -# send the hangouts notification - -# hangouts aggregates posts from the same room -# so we use "${host} ${status}" as the room, to make them diff - -send_hangouts "${to_hangouts}" -SENT_HANGOUTS=$? - -# ----------------------------------------------------------------------------- -# send the Microsoft notification - -# Microsoft team aggregates posts from the same username -# so we use "${host} ${status}" as the bot username, to make them diff - -send_msteam "${MSTEAM_WEBHOOK_URL}" "${to_msteam}" -SENT_MSTEAM=$? - -# ----------------------------------------------------------------------------- -# send the rocketchat notification - -# rocketchat aggregates posts from the same username -# so we use "${host} ${status}" as the bot username, to make them diff - -send_rocketchat "${ROCKETCHAT_WEBHOOK_URL}" "${to_rocketchat}" -SENT_ROCKETCHAT=$? - -# ----------------------------------------------------------------------------- -# send the alerta notification - -# alerta aggregates posts from the same username -# so we use "${host} ${status}" as the bot username, to make them diff - -send_alerta "${ALERTA_WEBHOOK_URL}" "${to_alerta}" -SENT_ALERTA=$? - -# ----------------------------------------------------------------------------- -# send the flock notification - -# flock aggregates posts from the same username -# so we use "${host} ${status}" as the bot username, to make them diff - -send_flock "${FLOCK_WEBHOOK_URL}" "${to_flock}" -SENT_FLOCK=$? - -# ----------------------------------------------------------------------------- -# send the discord notification - -# discord aggregates posts from the same username -# so we use "${host} ${status}" as the bot username, to make them diff - -send_discord "${DISCORD_WEBHOOK_URL}" "${to_discord}" -SENT_DISCORD=$? - -# ----------------------------------------------------------------------------- -# send the pushover notification - -send_pushover "${PUSHOVER_APP_TOKEN}" "${to_pushover}" "${when}" "${goto_url}" "${status}" "${host} ${status_message} - ${name//_/ } - ${chart}" " -${alarm}${info_html}
  -${chart}
Chart
 
-${family}
Family
 
-${severity}
Severity
 
-${date}${raised_for_html}
Time
 
-View Netdata
  -The source of this alarm is line ${src} -" - -SENT_PUSHOVER=$? - -# ----------------------------------------------------------------------------- -# send the pushbullet notification - -send_pushbullet "${PUSHBULLET_ACCESS_TOKEN}" "${PUSHBULLET_SOURCE_DEVICE}" "${to_pushbullet}" "${goto_url}" "${host} ${status_message} - ${name//_/ } - ${chart}" "${alarm}\\n -Severity: ${severity}\\n -Chart: ${chart}\\n -Family: ${family}\\n -${date}\\n -The source of this alarm is line ${src}" - -SENT_PUSHBULLET=$? - -# ----------------------------------------------------------------------------- -# send the twilio SMS - -send_twilio "${TWILIO_ACCOUNT_SID}" "${TWILIO_ACCOUNT_TOKEN}" "${TWILIO_NUMBER}" "${to_twilio}" "${host} ${status_message} - ${name//_/ } - ${chart}" "${alarm} -Severity: ${severity} -Chart: ${chart} -Family: ${family} -${info}" - -SENT_TWILIO=$? - -# ----------------------------------------------------------------------------- -# send the messagebird SMS - -send_messagebird "${MESSAGEBIRD_ACCESS_KEY}" "${MESSAGEBIRD_NUMBER}" "${to_messagebird}" "${host} ${status_message} - ${name//_/ } - ${chart}" "${alarm} -Severity: ${severity} -Chart: ${chart} -Family: ${family} -${info}" - -SENT_MESSAGEBIRD=$? - -# ----------------------------------------------------------------------------- -# send the kavenegar SMS - -send_kavenegar "${KAVENEGAR_API_KEY}" "${KAVENEGAR_SENDER}" "${to_kavenegar}" "${host} ${status_message} - ${name//_/ } - ${chart}" "${alarm} -Severity: ${severity} -Chart: ${chart} -Family: ${family} -${info}" - -SENT_KAVENEGAR=$? - -# ----------------------------------------------------------------------------- -# send the telegram.org message - -# https://core.telegram.org/bots/api#formatting-options -send_telegram "${TELEGRAM_BOT_TOKEN}" "${to_telegram}" "${host} ${status_message} - ${name//_/ } -${chart} (${family}) -${alarm} -${info}" - -SENT_TELEGRAM=$? - -# ----------------------------------------------------------------------------- -# send the kafka message - -send_kafka -SENT_KAFKA=$? - -# ----------------------------------------------------------------------------- -# send the pagerduty.com message - -send_pd "${to_pd}" -SENT_PD=$? - -# ----------------------------------------------------------------------------- -# send the fleep message - -send_fleep "${to_fleep}" -SENT_FLEEP=$? - -# ----------------------------------------------------------------------------- -# send the Prowl message - -send_prowl "${to_prowl}" -SENT_PROWL=$? - -# ----------------------------------------------------------------------------- -# send the irc message - -send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm} -Severity: ${severity} -Chart: ${chart} -Family: ${family} -${info}" - -SENT_IRC=$? - -# ----------------------------------------------------------------------------- -# send the SMS message with smstools3 - -send_sms "${to_sms}" - -SENT_SMS=$? - -# ----------------------------------------------------------------------------- -# send the custom message - -send_custom() { - # is it enabled? - [ "${SEND_CUSTOM}" != "YES" ] && return 1 - - # do we have any sender? - [ -z "${1}" ] && return 1 - - # call the custom_sender function - custom_sender "${@}" -} - -send_custom "${to_custom}" -SENT_CUSTOM=$? - -# ----------------------------------------------------------------------------- -# send hipchat message - -send_hipchat "${HIPCHAT_AUTH_TOKEN}" "${to_hipchat}" " \ -${host} ${status_message}
\ -${alarm} ${info_html}
\ -${chart} (family ${family})
\ -${date}${raised_for_html}
\ -View netdata dashboard \ -(source of alarm ${src}) \ -" - -SENT_HIPCHAT=$? - -# ----------------------------------------------------------------------------- -# send the Amazon SNS message - -send_awssns "${to_awssns}" - -SENT_AWSSNS=$? - -# ----------------------------------------------------------------------------- -# send the syslog message - -send_syslog "${to_syslog}" - -SENT_SYSLOG=$? - -# ----------------------------------------------------------------------------- -# send the email - -IFS='' read -r -d '' email_plaintext_part < - - - - - - - - - -
-
- - - - - - - - - - - - -
-
netdata notification
-
-

${host} ${status_message}

-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- ${chart} - Chart -
- ${alarm}${info_html} - Alarm -
- ${family} - Family -
- ${severity} - Severity -
${date} - ${raised_for_html} Time -
- ${calc_expression} - Evaluated Expression -
- ${calc_param_values} - Expression Variables -
- The host has ${total_warnings} WARNING and ${total_critical} CRITICAL alarm(s) raised. -
- View Netdata -
The source of this alarm is line ${src}
(alarms are configurable, edit this file to adapt the alarm to your needs) -
Sent by - netdata, the real-time performance and health monitoring, on ${host}. -
-
-
-
-
- - -EOF - -send_email <&2 - echo >&2 "# SENDING TEST ${x} ALARM TO ROLE: ${recipient}" - - "${0}" "${recipient}" "$(hostname)" 1 1 "${id}" "$(date +%s)" "test_alarm" "test.chart" "test.family" "${x}" "${last}" 100 90 "${0}" 1 $((0 + id)) "units" "this is a test alarm to verify notifications work" "new value" "old value" "evaluated expression" "expression variable values" 0 0 - #shellcheck disable=SC2181 - if [ $? -ne 0 ]; then - echo >&2 "# FAILED" - test_res=1 - else - echo >&2 "# OK" - fi - - last="${x}" - id=$((id + 1)) - done - - exit $test_res + if [ "${2}" = "test" ]; then + recipient="${1}" + else + recipient="${2}" + fi + + [ -z "${recipient}" ] && recipient="sysadmin" + + id=1 + last="CLEAR" + test_res=0 + for x in "WARNING" "CRITICAL" "CLEAR"; do + echo >&2 + echo >&2 "# SENDING TEST ${x} ALARM TO ROLE: ${recipient}" + + "${0}" "${recipient}" "$(hostname)" 1 1 "${id}" "$(date +%s)" "test_alarm" "test.chart" "test.family" "${x}" "${last}" 100 90 "${0}" 1 $((0 + id)) "units" "this is a test alarm to verify notifications work" "new value" "old value" "evaluated expression" "expression variable values" 0 0 + #shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + echo >&2 "# FAILED" + test_res=1 + else + echo >&2 "# OK" + fi + + last="${x}" + id=$((id + 1)) + done + + exit $test_res fi export PATH="${PATH}:/sbin:/usr/sbin:/usr/local/sbin" @@ -79,67 +82,67 @@ export LC_ALL=C PROGRAM_NAME="$(basename "${0}")" logdate() { - date "+%Y-%m-%d %H:%M:%S" + date "+%Y-%m-%d %H:%M:%S" } log() { - local status="${1}" - shift + local status="${1}" + shift - echo >&2 "$(logdate): ${PROGRAM_NAME}: ${status}: ${*}" + echo >&2 "$(logdate): ${PROGRAM_NAME}: ${status}: ${*}" } warning() { - log WARNING "${@}" + log WARNING "${@}" } error() { - log ERROR "${@}" + log ERROR "${@}" } info() { - log INFO "${@}" + log INFO "${@}" } fatal() { - log FATAL "${@}" - exit 1 + log FATAL "${@}" + exit 1 } debug=${NETDATA_ALARM_NOTIFY_DEBUG-0} debug() { - [ "${debug}" = "1" ] && log DEBUG "${@}" + [ "${debug}" = "1" ] && log DEBUG "${@}" } docurl() { - if [ -z "${curl}" ]; then - error "${curl} is unset." - return 1 - fi - - if [ "${debug}" = "1" ]; then - echo >&2 "--- BEGIN curl command ---" - printf >&2 "%q " ${curl} "${@}" - echo >&2 - echo >&2 "--- END curl command ---" - - local out code ret - out=$(mktemp /tmp/netdata-health-alarm-notify-XXXXXXXX) - code=$(${curl} ${curl_options} --write-out "%{http_code}" --output "${out}" --silent --show-error "${@}") - ret=$? - echo >&2 "--- BEGIN received response ---" - cat >&2 "${out}" - echo >&2 - echo >&2 "--- END received response ---" - echo >&2 "RECEIVED HTTP RESPONSE CODE: ${code}" - rm "${out}" - echo "${code}" - return ${ret} - fi - - ${curl} ${curl_options} --write-out "%{http_code}" --output /dev/null --silent --show-error "${@}" - return $? + if [ -z "${curl}" ]; then + error "${curl} is unset." + return 1 + fi + + if [ "${debug}" = "1" ]; then + echo >&2 "--- BEGIN curl command ---" + printf >&2 "%q " ${curl} "${@}" + echo >&2 + echo >&2 "--- END curl command ---" + + local out code ret + out=$(mktemp /tmp/netdata-health-alarm-notify-XXXXXXXX) + code=$(${curl} ${curl_options} --write-out "%{http_code}" --output "${out}" --silent --show-error "${@}") + ret=$? + echo >&2 "--- BEGIN received response ---" + cat >&2 "${out}" + echo >&2 + echo >&2 "--- END received response ---" + echo >&2 "RECEIVED HTTP RESPONSE CODE: ${code}" + rm "${out}" + echo "${code}" + return ${ret} + fi + + ${curl} ${curl_options} --write-out "%{http_code}" --output /dev/null --silent --show-error "${@}" + return $? } # ----------------------------------------------------------------------------- @@ -170,20 +173,22 @@ awssns rocketchat sms hangouts +dynatrace +matrix " # ----------------------------------------------------------------------------- # this is to be overwritten by the config file custom_sender() { - info "not sending custom notification for ${status} of '${host}.${chart}.${name}'" + info "not sending custom notification for ${status} of '${host}.${chart}.${name}'" } # ----------------------------------------------------------------------------- # check for BASH v4+ (required for associative arrays) if [ ${BASH_VERSINFO[0]} -lt 4 ]; then - fatal "BASH version 4 or later is required (this is ${BASH_VERSION})." + fatal "BASH version 4 or later is required (this is ${BASH_VERSION})." fi # ----------------------------------------------------------------------------- @@ -193,55 +198,55 @@ fi [ -z "${NETDATA_STOCK_CONFIG_DIR}" ] && NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@" [ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="@cachedir_POST@" [ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io" -[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://netdata.cloud" +[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://app.netdata.cloud" # ----------------------------------------------------------------------------- # parse command line parameters if [[ ${1} = "unittest" ]]; then - unittest=1 # enable unit testing mode - roles="${2}" # the role that should be used for unit testing - cfgfile="${3}" # the location of the config file to use for unit testing - status="${4}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL + unittest=1 # enable unit testing mode + roles="${2}" # the role that should be used for unit testing + cfgfile="${3}" # the location of the config file to use for unit testing + status="${4}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL + old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL else - roles="${1}" # the roles that should be notified for this event - args_host="${2}" # the host generated this event - unique_id="${3}" # the unique id of this event - alarm_id="${4}" # the unique id of the alarm that generated this event - event_id="${5}" # the incremental id of the event, for this alarm id - when="${6}" # the timestamp this event occurred - name="${7}" # the name of the alarm, as given in netdata health.d entries - chart="${8}" # the name of the chart (type.id) - family="${9}" # the family of the chart - status="${10}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - old_status="${11}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - value="${12}" # the current value of the alarm - old_value="${13}" # the previous value of the alarm - src="${14}" # the line number and file the alarm has been configured - duration="${15}" # the duration in seconds of the previous alarm state - non_clear_duration="${16}" # the total duration in seconds this is/was non-clear - units="${17}" # the units of the value - info="${18}" # a short description of the alarm - value_string="${19}" # friendly value (with units) - # shellcheck disable=SC2034 - # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947 - old_value_string="${20}" # friendly old value (with units), previously named "old_value_string" - calc_expression="${21}" # contains the expression that was evaluated to trigger the alarm - calc_param_values="${22}" # the values of the parameters in the expression, at the time of the evaluation - total_warnings="${23}" # Total number of alarms in WARNING state - total_critical="${24}" # Total number of alarms in CRITICAL state + roles="${1}" # the roles that should be notified for this event + args_host="${2}" # the host generated this event + unique_id="${3}" # the unique id of this event + alarm_id="${4}" # the unique id of the alarm that generated this event + event_id="${5}" # the incremental id of the event, for this alarm id + when="${6}" # the timestamp this event occurred + name="${7}" # the name of the alarm, as given in netdata health.d entries + chart="${8}" # the name of the chart (type.id) + family="${9}" # the family of the chart + status="${10}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL + old_status="${11}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL + value="${12}" # the current value of the alarm + old_value="${13}" # the previous value of the alarm + src="${14}" # the line number and file the alarm has been configured + duration="${15}" # the duration in seconds of the previous alarm state + non_clear_duration="${16}" # the total duration in seconds this is/was non-clear + units="${17}" # the units of the value + info="${18}" # a short description of the alarm + value_string="${19}" # friendly value (with units) + # shellcheck disable=SC2034 + # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947 + old_value_string="${20}" # friendly old value (with units), previously named "old_value_string" + calc_expression="${21}" # contains the expression that was evaluated to trigger the alarm + calc_param_values="${22}" # the values of the parameters in the expression, at the time of the evaluation + total_warnings="${23}" # Total number of alarms in WARNING state + total_critical="${24}" # Total number of alarms in CRITICAL state fi # ----------------------------------------------------------------------------- # find a suitable hostname to use, if netdata did not supply a hostname if [ -z ${args_host} ]; then - this_host=$(hostname -s 2>/dev/null) - host="${this_host}" - args_host="${this_host}" + this_host=$(hostname -s 2>/dev/null) + host="${this_host}" + args_host="${this_host}" else - host="${args_host}" + host="${args_host}" fi # ----------------------------------------------------------------------------- @@ -249,14 +254,14 @@ fi # don't do anything if this is not WARNING, CRITICAL or CLEAR if [ "${status}" != "WARNING" ] && [ "${status}" != "CRITICAL" ] && [ "${status}" != "CLEAR" ]; then - info "not sending notification for ${status} of '${host}.${chart}.${name}'" - exit 1 + info "not sending notification for ${status} of '${host}.${chart}.${name}'" + exit 1 fi # don't do anything if this is CLEAR, but it was not WARNING or CRITICAL if [ "${clear_alarm_always}" != "YES" ] && [ "${old_status}" != "WARNING" ] && [ "${old_status}" != "CRITICAL" ] && [ "${status}" = "CLEAR" ]; then - info "not sending notification for ${status} of '${host}.${chart}.${name}' (last status was ${old_status})" - exit 1 + info "not sending notification for ${status} of '${host}.${chart}.${name}' (last status was ${old_status})" + exit 1 fi # ----------------------------------------------------------------------------- @@ -281,12 +286,12 @@ sendmail= # enable / disable features for method_name in ${method_names^^}; do - declare SEND_${method_name}="YES" - declare DEFAULT_RECIPIENT_${method_name} + declare SEND_${method_name}="YES" + declare DEFAULT_RECIPIENT_${method_name} done for method_name in ${method_names}; do - declare -A role_recipients_${method_name} + declare -A role_recipients_${method_name} done # slack configs @@ -342,6 +347,7 @@ KAFKA_SENDER_IP= # pagerduty.com configs PD_SERVICE_KEY= +USE_PD_VERSION= # fleep.io configs FLEEP_SENDER="${host}" @@ -349,6 +355,10 @@ FLEEP_SENDER="${host}" # Amazon SNS configs AWSSNS_MESSAGE_FORMAT= +# Matrix configs +MATRIX_HOMESERVER= +MATRIX_ACCESSTOKEN= + # syslog configs SYSLOG_FACILITY= @@ -365,92 +375,108 @@ IRC_NETWORK= # hangouts configs declare -A HANGOUTS_WEBHOOK_URI +declare -A HANGOUTS_WEBHOOK_THREAD + +# dynatrace configs +DYNATRACE_SPACE= +DYNATRACE_SERVER= +DYNATRACE_TOKEN= +DYNATRACE_TAG_VALUE= +DYNATRACE_ANNOTATION_TYPE= +DYNATRACE_EVENT= +SEND_DYNATRACE= + +# stackpulse configs +STACKPULSE_WEBHOOK= + +# opsgenie configs +OPSGENIE_API_KEY= # load the stock and user configuration files # these will overwrite the variables above if [ ${unittest} ]; then - if source "${cfgfile}"; then - error "Failed to load requested config file." - exit 1 - fi + if source "${cfgfile}"; then + error "Failed to load requested config file." + exit 1 + fi else - for CONFIG in "${NETDATA_STOCK_CONFIG_DIR}/health_alarm_notify.conf" "${NETDATA_USER_CONFIG_DIR}/health_alarm_notify.conf"; do - if [ -f "${CONFIG}" ]; then - debug "Loading config file '${CONFIG}'..." - source "${CONFIG}" || error "Failed to load config file '${CONFIG}'." - else - warning "Cannot find file '${CONFIG}'." - fi - done + for CONFIG in "${NETDATA_STOCK_CONFIG_DIR}/health_alarm_notify.conf" "${NETDATA_USER_CONFIG_DIR}/health_alarm_notify.conf"; do + if [ -f "${CONFIG}" ]; then + debug "Loading config file '${CONFIG}'..." + source "${CONFIG}" || error "Failed to load config file '${CONFIG}'." + else + warning "Cannot find file '${CONFIG}'." + fi + done fi # If we didn't autodetect the character set for e-mail and it wasn't # set by the user, we need to set it to a reasonable default. UTF-8 # should be correct for almost all modern UNIX systems. if [ -z ${EMAIL_CHARSET} ]; then - EMAIL_CHARSET="UTF-8" + EMAIL_CHARSET="UTF-8" fi # If we've been asked to use FQDN's for the URL's in the alarm, do so, -# unless we're sending an alarm for a slave system which we can't get the +# unless we're sending an alarm for a child system which we can't get the # FQDN of easily. if [ "${use_fqdn}" = "YES" ] && [ "${host}" = "$(hostname -s 2>/dev/null)" ]; then - host="$(hostname -f 2>/dev/null)" + host="$(hostname -f 2>/dev/null)" fi # ----------------------------------------------------------------------------- # filter a recipient based on alarm event severity filter_recipient_by_criticality() { - local method="${1}" x="${2}" r s - shift - - r="${x/|*/}" # the recipient - s="${x/*|/}" # the severity required for notifying this recipient - - # no severity filtering for this person - [ "${r}" = "${s}" ] && return 0 - - # the severity is invalid - s="${s^^}" - if [ "${s}" != "CRITICAL" ]; then - error "SEVERITY FILTERING for ${x} VIA ${method}: invalid severity '${s,,}', only 'critical' is supported." - return 0 - fi - - # create the status tracking directory for this user - [ ! -d "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" ] && - mkdir -p "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" - - case "${status}" in - CRITICAL) - # make sure he will get future notifications for this alarm too - touch "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" - debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: the alarm is CRITICAL (will now receive next status change)" - return 0 - ;; - - WARNING) - if [ -f "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" ]; then - # we do not remove the file, so that he will get future notifications of this alarm - debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: recipient has been notified for this alarm in the past (will still receive next status change)" - return 0 - fi - ;; - - *) - if [ -f "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" ]; then - # remove the file, so that he will only receive notifications for CRITICAL states for this alarm - rm "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" - debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: recipient has been notified for this alarm (will only receive CRITICAL notifications from now on)" - return 0 - fi - ;; - esac - - debug "SEVERITY FILTERING for ${x} VIA ${method}: BLOCK: recipient should not receive this notification" - return 1 + local method="${1}" x="${2}" r s + shift + + r="${x/|*/}" # the recipient + s="${x/*|/}" # the severity required for notifying this recipient + + # no severity filtering for this person + [ "${r}" = "${s}" ] && return 0 + + # the severity is invalid + s="${s^^}" + if [ "${s}" != "CRITICAL" ]; then + error "SEVERITY FILTERING for ${x} VIA ${method}: invalid severity '${s,,}', only 'critical' is supported." + return 0 + fi + + # create the status tracking directory for this user + [ ! -d "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" ] && + mkdir -p "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}" + + case "${status}" in + CRITICAL) + # make sure he will get future notifications for this alarm too + touch "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" + debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: the alarm is CRITICAL (will now receive next status change)" + return 0 + ;; + + WARNING) + if [ -f "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" ]; then + # we do not remove the file, so that he will get future notifications of this alarm + debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: recipient has been notified for this alarm in the past (will still receive next status change)" + return 0 + fi + ;; + + *) + if [ -f "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" ]; then + # remove the file, so that he will only receive notifications for CRITICAL states for this alarm + rm "${NETDATA_CACHE_DIR}/alarm-notify/${method}/${r}/${alarm_id}" + debug "SEVERITY FILTERING for ${x} VIA ${method}: ALLOW: recipient has been notified for this alarm (will only receive CRITICAL notifications from now on)" + return 0 + fi + ;; + esac + + debug "SEVERITY FILTERING for ${x} VIA ${method}: BLOCK: recipient should not receive this notification" + return 1 } # ----------------------------------------------------------------------------- @@ -505,86 +531,110 @@ filter_recipient_by_criticality() { #shellcheck disable=SC2153 { [ -z "${FLEEP_SERVER}" ] || [ -z "${FLEEP_SENDER}" ]; } && SEND_FLEEP="NO" +# check dynatrace +{ [ -z "${DYNATRACE_SPACE}" ] || + [ -z "${DYNATRACE_SERVER}" ] || + [ -z "${DYNATRACE_TOKEN}" ] || + [ -z "${DYNATRACE_TAG_VALUE}" ] || + [ -z "${DYNATRACE_EVENT}" ]; } && SEND_DYNATRACE="NO" + +# check opsgenie +[ -z "${OPSGENIE_API_KEY}" ] && SEND_OPSGENIE="NO" + +# check matrix +{ [ -z "${MATRIX_HOMESERVER}" ] || [ -z "${MATRIX_ACCESSTOKEN}" ]; } && SEND_MATRIX="NO" + +# check stackpulse +[ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO" + if [ "${SEND_PUSHOVER}" = "YES" ] || - [ "${SEND_SLACK}" = "YES" ] || - [ "${SEND_ROCKETCHAT}" = "YES" ] || - [ "${SEND_ALERTA}" = "YES" ] || - [ "${SEND_PD}" = "YES" ] || - [ "${SEND_FLOCK}" = "YES" ] || - [ "${SEND_DISCORD}" = "YES" ] || - [ "${SEND_HIPCHAT}" = "YES" ] || - [ "${SEND_TWILIO}" = "YES" ] || - [ "${SEND_MESSAGEBIRD}" = "YES" ] || - [ "${SEND_KAVENEGAR}" = "YES" ] || - [ "${SEND_TELEGRAM}" = "YES" ] || - [ "${SEND_PUSHBULLET}" = "YES" ] || - [ "${SEND_KAFKA}" = "YES" ] || - [ "${SEND_FLEEP}" = "YES" ] || - [ "${SEND_PROWL}" = "YES" ] || - [ "${SEND_HANGOUTS}" = "YES" ] || - [ "${SEND_CUSTOM}" = "YES" ] || - [ "${SEND_MSTEAM}" = "YES" ]; then - # if we need curl, check for the curl command - if [ -z "${curl}" ]; then - curl="$(command -v curl 2>/dev/null)" - fi - if [ -z "${curl}" ]; then - error "Cannot find curl command in the system path. Disabling all curl based notifications." - SEND_PUSHOVER="NO" - SEND_PUSHBULLET="NO" - SEND_TELEGRAM="NO" - SEND_SLACK="NO" - SEND_MSTEAM="NO" - SEND_ROCKETCHAT="NO" - SEND_ALERTA="NO" - SEND_PD="NO" - SEND_FLOCK="NO" - SEND_DISCORD="NO" - SEND_TWILIO="NO" - SEND_HIPCHAT="NO" - SEND_MESSAGEBIRD="NO" - SEND_KAVENEGAR="NO" - SEND_KAFKA="NO" - SEND_FLEEP="NO" - SEND_PROWL="NO" - SEND_HANGOUTS="NO" - SEND_CUSTOM="NO" - fi + [ "${SEND_SLACK}" = "YES" ] || + [ "${SEND_ROCKETCHAT}" = "YES" ] || + [ "${SEND_ALERTA}" = "YES" ] || + [ "${SEND_PD}" = "YES" ] || + [ "${SEND_FLOCK}" = "YES" ] || + [ "${SEND_DISCORD}" = "YES" ] || + [ "${SEND_HIPCHAT}" = "YES" ] || + [ "${SEND_TWILIO}" = "YES" ] || + [ "${SEND_MESSAGEBIRD}" = "YES" ] || + [ "${SEND_KAVENEGAR}" = "YES" ] || + [ "${SEND_TELEGRAM}" = "YES" ] || + [ "${SEND_PUSHBULLET}" = "YES" ] || + [ "${SEND_KAFKA}" = "YES" ] || + [ "${SEND_FLEEP}" = "YES" ] || + [ "${SEND_PROWL}" = "YES" ] || + [ "${SEND_HANGOUTS}" = "YES" ] || + [ "${SEND_MATRIX}" = "YES" ] || + [ "${SEND_CUSTOM}" = "YES" ] || + [ "${SEND_MSTEAM}" = "YES" ] || + [ "${SEND_DYNATRACE}" = "YES" ] || + [ "${SEND_STACKPULSE}" = "YES" ] || + [ "${SEND_OPSGENIE}" = "YES" ]; then + # if we need curl, check for the curl command + if [ -z "${curl}" ]; then + curl="$(command -v curl 2>/dev/null)" + fi + if [ -z "${curl}" ]; then + error "Cannot find curl command in the system path. Disabling all curl based notifications." + SEND_PUSHOVER="NO" + SEND_PUSHBULLET="NO" + SEND_TELEGRAM="NO" + SEND_SLACK="NO" + SEND_MSTEAM="NO" + SEND_ROCKETCHAT="NO" + SEND_ALERTA="NO" + SEND_PD="NO" + SEND_FLOCK="NO" + SEND_DISCORD="NO" + SEND_TWILIO="NO" + SEND_HIPCHAT="NO" + SEND_MESSAGEBIRD="NO" + SEND_KAVENEGAR="NO" + SEND_KAFKA="NO" + SEND_FLEEP="NO" + SEND_PROWL="NO" + SEND_HANGOUTS="NO" + SEND_MATRIX="NO" + SEND_CUSTOM="NO" + SEND_DYNATRACE="NO" + SEND_STACKPULSE="NO" + SEND_OPSGENIE="NO" + fi fi if [ "${SEND_SMS}" = "YES" ]; then - if [ -z "${sendsms}" ]; then - sendsms="$(command -v sendsms 2>/dev/null)" - fi - if [ -z "${sendsms}" ]; then - SEND_SMS="NO" - fi + if [ -z "${sendsms}" ]; then + sendsms="$(command -v sendsms 2>/dev/null)" + fi + if [ -z "${sendsms}" ]; then + SEND_SMS="NO" + fi fi # if we need sendmail, check for the sendmail command if [ "${SEND_EMAIL}" = "YES" ] && [ -z "${sendmail}" ]; then - sendmail="$(command -v sendmail 2>/dev/null)" - if [ -z "${sendmail}" ]; then - debug "Cannot find sendmail command in the system path. Disabling email notifications." - SEND_EMAIL="NO" - fi + sendmail="$(command -v sendmail 2>/dev/null)" + if [ -z "${sendmail}" ]; then + debug "Cannot find sendmail command in the system path. Disabling email notifications." + SEND_EMAIL="NO" + fi fi # if we need logger, check for the logger command if [ "${SEND_SYSLOG}" = "YES" ] && [ -z "${logger}" ]; then - logger="$(command -v logger 2>/dev/null)" - if [ -z "${logger}" ]; then - debug "Cannot find logger command in the system path. Disabling syslog notifications." - SEND_SYSLOG="NO" - fi + logger="$(command -v logger 2>/dev/null)" + if [ -z "${logger}" ]; then + debug "Cannot find logger command in the system path. Disabling syslog notifications." + SEND_SYSLOG="NO" + fi fi # if we need aws, check for the aws command if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then - aws="$(command -v aws 2>/dev/null)" - if [ -z "${aws}" ]; then - debug "Cannot find aws command in the system path. Disabling Amazon SNS notifications." - SEND_AWSSNS="NO" - fi + aws="$(command -v aws 2>/dev/null)" + if [ -z "${aws}" ]; then + debug "Cannot find aws command in the system path. Disabling Amazon SNS notifications." + SEND_AWSSNS="NO" + fi fi # ----------------------------------------------------------------------------- @@ -593,47 +643,47 @@ fi # netdata may call us with multiple roles, and roles may have multiple but # overlapping recipients - so, here we find the unique recipients. for method_name in ${method_names}; do - send_var="SEND_${method_name^^}" - if [ "${!send_var}" = "NO" ]; then - continue - fi - - declare -A arr_var=() - - for x in ${roles//,/ }; do - # the roles 'silent' and 'disabled' mean: - # don't send a notification for this role - if [ "${x}" = "silent" ] || [ "${x}" = "disabled" ]; then - continue - fi - - role_recipients="role_recipients_${method_name}[$x]" - default_recipient_var="DEFAULT_RECIPIENT_${method_name^^}" - - a="${!role_recipients}" - [ -z "${a}" ] && a="${!default_recipient_var}" - for r in ${a//,/ }; do - [ "${r}" != "disabled" ] && filter_recipient_by_criticality ${method_name} "${r}" && arr_var[${r/|*/}]="1" - done - done - - # build the list of recipients - to_var="to_${method_name}" - declare to_${method_name}="${!arr_var[*]}" - - [ -z "${!to_var}" ] && declare ${send_var}="NO" + send_var="SEND_${method_name^^}" + if [ "${!send_var}" = "NO" ]; then + continue + fi + + declare -A arr_var=() + + for x in ${roles//,/ }; do + # the roles 'silent' and 'disabled' mean: + # don't send a notification for this role + if [ "${x}" = "silent" ] || [ "${x}" = "disabled" ]; then + continue + fi + + role_recipients="role_recipients_${method_name}[$x]" + default_recipient_var="DEFAULT_RECIPIENT_${method_name^^}" + + a="${!role_recipients}" + [ -z "${a}" ] && a="${!default_recipient_var}" + for r in ${a//,/ }; do + [ "${r}" != "disabled" ] && filter_recipient_by_criticality ${method_name} "${r}" && arr_var[${r/|*/}]="1" + done + done + + # build the list of recipients + to_var="to_${method_name}" + declare to_${method_name}="${!arr_var[*]}" + + [ -z "${!to_var}" ] && declare ${send_var}="NO" done # ----------------------------------------------------------------------------- # handle fixup of the email recipient list. fix_to_email() { - to_email= - while [ -n "${1}" ]; do - [ -n "${to_email}" ] && to_email="${to_email}, " - to_email="${to_email}${1}" - shift 1 - done + to_email= + while [ -n "${1}" ]; do + [ -n "${to_email}" ] && to_email="${to_email}, " + to_email="${to_email}${1}" + shift 1 + done } # ${to_email} without quotes here @@ -642,47 +692,52 @@ fix_to_email ${to_email} # ----------------------------------------------------------------------------- # handle output if we're running in unit test mode if [ ${unittest} ]; then - for method_name in ${method_names}; do - to_var="to_${method_name}" - echo "results: ${method_name}: ${!to_var}" - done - exit 0 + for method_name in ${method_names}; do + to_var="to_${method_name}" + echo "results: ${method_name}: ${!to_var}" + done + exit 0 fi # ----------------------------------------------------------------------------- # check that we have at least a method enabled proceed=0 for method in "${SEND_EMAIL}" \ - "${SEND_PUSHOVER}" \ - "${SEND_TELEGRAM}" \ - "${SEND_SLACK}" \ - "${SEND_ROCKETCHAT}" \ - "${SEND_ALERTA}" \ - "${SEND_FLOCK}" \ - "${SEND_DISCORD}" \ - "${SEND_TWILIO}" \ - "${SEND_HIPCHAT}" \ - "${SEND_MESSAGEBIRD}" \ - "${SEND_KAVENEGAR}" \ - "${SEND_PUSHBULLET}" \ - "${SEND_KAFKA}" \ - "${SEND_PD}" \ - "${SEND_FLEEP}" \ - "${SEND_PROWL}" \ - "${SEND_CUSTOM}" \ - "${SEND_IRC}" \ - "${SEND_HANGOUTS}" \ - "${SEND_AWSSNS}" \ - "${SEND_SYSLOG}" \ - "${SEND_SMS}" \ - "${SEND_MSTEAM}"; do - if [ "${method}" == "YES" ]; then - proceed=1 - break - fi + "${SEND_PUSHOVER}" \ + "${SEND_TELEGRAM}" \ + "${SEND_SLACK}" \ + "${SEND_ROCKETCHAT}" \ + "${SEND_ALERTA}" \ + "${SEND_FLOCK}" \ + "${SEND_DISCORD}" \ + "${SEND_TWILIO}" \ + "${SEND_HIPCHAT}" \ + "${SEND_MESSAGEBIRD}" \ + "${SEND_KAVENEGAR}" \ + "${SEND_PUSHBULLET}" \ + "${SEND_KAFKA}" \ + "${SEND_PD}" \ + "${SEND_FLEEP}" \ + "${SEND_PROWL}" \ + "${SEND_MATRIX}" \ + "${SEND_CUSTOM}" \ + "${SEND_IRC}" \ + "${SEND_HANGOUTS}" \ + "${SEND_AWSSNS}" \ + "${SEND_SYSLOG}" \ + "${SEND_SMS}" \ + "${SEND_MSTEAM}" \ + "${SEND_DYNATRACE}" \ + "${SEND_STACKPULSE}" \ + "${SEND_OPSGENIE}" ; do + + if [ "${method}" == "YES" ]; then + proceed=1 + break + fi done if [ "$proceed" -eq 0 ]; then - fatal "All notification methods are disabled. Not sending notification for host '${host}', chart '${chart}' to '${roles}' for '${name}' = '${value}' for status '${status}'." + fatal "All notification methods are disabled. Not sending notification for host '${host}', chart '${chart}' to '${roles}' for '${name}' = '${value}' for status '${status}'." fi # ----------------------------------------------------------------------------- @@ -696,34 +751,34 @@ date=$(date --date=@${when} "${date_format}" 2>/dev/null) # ---------------------------------------------------------------------------- # prepare some extra headers if we've been asked to thread e-mails if [ "${SEND_EMAIL}" == "YES" ] && [ "${EMAIL_THREADING}" != "NO" ]; then - email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\r\\nReferences: <${chart}-${name}@${host}>" + email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\r\\nReferences: <${chart}-${name}@${host}>" else - email_thread_headers= + email_thread_headers= fi # ----------------------------------------------------------------------------- # function to URL encode a string urlencode() { - local string="${1}" strlen encoded pos c o - - strlen=${#string} - for ((pos = 0; pos < strlen; pos++)); do - c=${string:pos:1} - case "${c}" in - [-_.~a-zA-Z0-9]) - o="${c}" - ;; - - *) - printf -v o '%%%02x' "'${c}" - ;; - esac - encoded+="${o}" - done - - REPLY="${encoded}" - echo "${REPLY}" + local string="${1}" strlen encoded pos c o + + strlen=${#string} + for ((pos = 0; pos < strlen; pos++)); do + c=${string:pos:1} + case "${c}" in + [-_.~a-zA-Z0-9]) + o="${c}" + ;; + + *) + printf -v o '%%%02x' "'${c}" + ;; + esac + encoded+="${o}" + done + + REPLY="${encoded}" + echo "${REPLY}" } # ----------------------------------------------------------------------------- @@ -731,154 +786,154 @@ urlencode() { # using DAYS, MINUTES, SECONDS duration4human() { - local s="${1}" d=0 h=0 m=0 ds="day" hs="hour" ms="minute" ss="second" ret - d=$((s / 86400)) - s=$((s - (d * 86400))) - h=$((s / 3600)) - s=$((s - (h * 3600))) - m=$((s / 60)) - s=$((s - (m * 60))) - - if [ ${d} -gt 0 ]; then - [ ${m} -ge 30 ] && h=$((h + 1)) - [ ${d} -gt 1 ] && ds="days" - [ ${h} -gt 1 ] && hs="hours" - if [ ${h} -gt 0 ]; then - ret="${d} ${ds} and ${h} ${hs}" - else - ret="${d} ${ds}" - fi - elif [ ${h} -gt 0 ]; then - [ ${s} -ge 30 ] && m=$((m + 1)) - [ ${h} -gt 1 ] && hs="hours" - [ ${m} -gt 1 ] && ms="minutes" - if [ ${m} -gt 0 ]; then - ret="${h} ${hs} and ${m} ${ms}" - else - ret="${h} ${hs}" - fi - elif [ ${m} -gt 0 ]; then - [ ${m} -gt 1 ] && ms="minutes" - [ ${s} -gt 1 ] && ss="seconds" - if [ ${s} -gt 0 ]; then - ret="${m} ${ms} and ${s} ${ss}" - else - ret="${m} ${ms}" - fi - else - [ ${s} -gt 1 ] && ss="seconds" - ret="${s} ${ss}" - fi - - REPLY="${ret}" - echo "${REPLY}" + local s="${1}" d=0 h=0 m=0 ds="day" hs="hour" ms="minute" ss="second" ret + d=$((s / 86400)) + s=$((s - (d * 86400))) + h=$((s / 3600)) + s=$((s - (h * 3600))) + m=$((s / 60)) + s=$((s - (m * 60))) + + if [ ${d} -gt 0 ]; then + [ ${m} -ge 30 ] && h=$((h + 1)) + [ ${d} -gt 1 ] && ds="days" + [ ${h} -gt 1 ] && hs="hours" + if [ ${h} -gt 0 ]; then + ret="${d} ${ds} and ${h} ${hs}" + else + ret="${d} ${ds}" + fi + elif [ ${h} -gt 0 ]; then + [ ${s} -ge 30 ] && m=$((m + 1)) + [ ${h} -gt 1 ] && hs="hours" + [ ${m} -gt 1 ] && ms="minutes" + if [ ${m} -gt 0 ]; then + ret="${h} ${hs} and ${m} ${ms}" + else + ret="${h} ${hs}" + fi + elif [ ${m} -gt 0 ]; then + [ ${m} -gt 1 ] && ms="minutes" + [ ${s} -gt 1 ] && ss="seconds" + if [ ${s} -gt 0 ]; then + ret="${m} ${ms} and ${s} ${ss}" + else + ret="${m} ${ms}" + fi + else + [ ${s} -gt 1 ] && ss="seconds" + ret="${s} ${ss}" + fi + + REPLY="${ret}" + echo "${REPLY}" } # ----------------------------------------------------------------------------- # email sender send_email() { - local ret opts=() sender_email="${EMAIL_SENDER}" sender_name= - if [ "${SEND_EMAIL}" = "YES" ]; then - - if [ -n "${EMAIL_SENDER}" ]; then - if [[ ${EMAIL_SENDER} =~ ^\".*\"\ \<.*\>$ ]]; then - # the name includes double quotes - sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" - sender_name="$(echo "${EMAIL_SENDER}" | cut -d '"' -f 2)" - elif [[ ${EMAIL_SENDER} =~ ^\'.*\'\ \<.*\>$ ]]; then - # the name includes single quotes - sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" - sender_name="$(echo "${EMAIL_SENDER}" | cut -d "'" -f 2)" - elif [[ ${EMAIL_SENDER} =~ ^.*\ \<.*\>$ ]]; then - # the name does not have any quotes - sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" - sender_name="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 1)" - fi - fi - - [ -n "${sender_email}" ] && opts+=(-f "${sender_email}") - [ -n "${sender_name}" ] && opts+=(-F "${sender_name}") - - if [ "${debug}" = "1" ]; then - echo >&2 "--- BEGIN sendmail command ---" - printf >&2 "%q " "${sendmail}" -t "${opts[@]}" - echo >&2 - echo >&2 "--- END sendmail command ---" - fi - - "${sendmail}" -t "${opts[@]}" - ret=$? - - if [ ${ret} -eq 0 ]; then - info "sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'" - return 0 - else - error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}." - return 1 - fi - fi - - return 1 + local ret opts=() sender_email="${EMAIL_SENDER}" sender_name= + if [ "${SEND_EMAIL}" = "YES" ]; then + + if [ -n "${EMAIL_SENDER}" ]; then + if [[ ${EMAIL_SENDER} =~ ^\".*\"\ \<.*\>$ ]]; then + # the name includes double quotes + sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" + sender_name="$(echo "${EMAIL_SENDER}" | cut -d '"' -f 2)" + elif [[ ${EMAIL_SENDER} =~ ^\'.*\'\ \<.*\>$ ]]; then + # the name includes single quotes + sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" + sender_name="$(echo "${EMAIL_SENDER}" | cut -d "'" -f 2)" + elif [[ ${EMAIL_SENDER} =~ ^.*\ \<.*\>$ ]]; then + # the name does not have any quotes + sender_email="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 2 | cut -d '>' -f 1)" + sender_name="$(echo "${EMAIL_SENDER}" | cut -d '<' -f 1)" + fi + fi + + [ -n "${sender_email}" ] && opts+=(-f "${sender_email}") + [ -n "${sender_name}" ] && opts+=(-F "${sender_name}") + + if [ "${debug}" = "1" ]; then + echo >&2 "--- BEGIN sendmail command ---" + printf >&2 "%q " "${sendmail}" -t "${opts[@]}" + echo >&2 + echo >&2 "--- END sendmail command ---" + fi + + "${sendmail}" -t "${opts[@]}" + ret=$? + + if [ ${ret} -eq 0 ]; then + info "sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'" + return 0 + else + error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}." + return 1 + fi + fi + + return 1 } # ----------------------------------------------------------------------------- # pushover sender send_pushover() { - local apptoken="${1}" usertokens="${2}" when="${3}" url="${4}" status="${5}" title="${6}" message="${7}" httpcode sent=0 user priority - - if [ "${SEND_PUSHOVER}" = "YES" ] && [ -n "${apptoken}" ] && [ -n "${usertokens}" ] && [ -n "${title}" ] && [ -n "${message}" ]; then - - # https://pushover.net/api - priority=-2 - case "${status}" in - CLEAR) priority=-1 ;; # low priority: no sound or vibration - WARNING) priority=0 ;; # normal priority: respect quiet hours - CRITICAL) priority=1 ;; # high priority: bypass quiet hours - *) priority=-2 ;; # lowest priority: no notification at all - esac - - for user in ${usertokens}; do - httpcode=$(docurl \ - --form-string "token=${apptoken}" \ - --form-string "user=${user}" \ - --form-string "html=1" \ - --form-string "title=${title}" \ - --form-string "message=${message}" \ - --form-string "timestamp=${when}" \ - --form-string "url=${url}" \ - --form-string "url_title=Open netdata dashboard to view the alarm" \ - --form-string "priority=${priority}" \ - https://api.pushover.net/1/messages.json) - - if [ "${httpcode}" = "200" ]; then - info "sent pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}'" - sent=$((sent + 1)) - else - error "failed to send pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 + local apptoken="${1}" usertokens="${2}" when="${3}" url="${4}" status="${5}" title="${6}" message="${7}" httpcode sent=0 user priority + + if [ "${SEND_PUSHOVER}" = "YES" ] && [ -n "${apptoken}" ] && [ -n "${usertokens}" ] && [ -n "${title}" ] && [ -n "${message}" ]; then + + # https://pushover.net/api + priority=-2 + case "${status}" in + CLEAR) priority=-1 ;; # low priority: no sound or vibration + WARNING) priority=0 ;; # normal priority: respect quiet hours + CRITICAL) priority=1 ;; # high priority: bypass quiet hours + *) priority=-2 ;; # lowest priority: no notification at all + esac + + for user in ${usertokens}; do + httpcode=$(docurl \ + --form-string "token=${apptoken}" \ + --form-string "user=${user}" \ + --form-string "html=1" \ + --form-string "title=${title}" \ + --form-string "message=${message}" \ + --form-string "timestamp=${when}" \ + --form-string "url=${url}" \ + --form-string "url_title=Open netdata dashboard to view the alarm" \ + --form-string "priority=${priority}" \ + https://api.pushover.net/1/messages.json) + + if [ "${httpcode}" = "200" ]; then + info "sent pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}'" + sent=$((sent + 1)) + else + error "failed to send pushover notification for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP response status code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 } # ----------------------------------------------------------------------------- # pushbullet sender send_pushbullet() { - local userapikey="${1}" source_device="${2}" recipients="${3}" url="${4}" title="${5}" message="${6}" httpcode sent=0 user - if [ "${SEND_PUSHBULLET}" = "YES" ] && [ -n "${userapikey}" ] && [ -n "${recipients}" ] && [ -n "${message}" ] && [ -n "${title}" ]; then - #https://docs.pushbullet.com/#create-push - for user in ${recipients}; do - httpcode=$(docurl \ - --header 'Access-Token: '${userapikey}'' \ - --header 'Content-Type: application/json' \ - --data-binary @<( - cat <
from the message - message="${message///}" - message="${message//<\/small>/}" - - if [ "${SEND_HIPCHAT}" = "YES" ] && [ -n "${HIPCHAT_SERVER}" ] && [ -n "${authtoken}" ] && [ -n "${recipients}" ] && [ -n "${message}" ]; then - # Valid values: html, text. - # Defaults to 'html'. - msg_format="html" - - # Background color for message. Valid values: yellow, green, red, purple, gray, random. Defaults to 'yellow'. - case "${status}" in - WARNING) color="yellow" ;; - CRITICAL) color="red" ;; - CLEAR) color="green" ;; - *) color="gray" ;; - esac - - # Whether this message should trigger a user notification (change the tab color, play a sound, notify mobile phones, etc). - # Each recipient's notification preferences are taken into account. - # Defaults to false. - notify="true" - - for room in ${recipients}; do - httpcode=$(docurl -X POST \ - -H "Content-type: application/json" \ - -H "Authorization: Bearer ${authtoken}" \ - -d "{\"color\": \"${color}\", \"from\": \"${host}\", \"message_format\": \"${msg_format}\", \"message\": \"${message}\", \"notify\": \"${notify}\"}" \ - "https://${HIPCHAT_SERVER}/v2/room/${room}/notification") - - if [ "${httpcode}" = "204" ]; then - info "sent HipChat notification for: ${host} ${chart}.${name} is ${status} to '${room}'" - sent=$((sent + 1)) - else - error "failed to send HipChat notification for: ${host} ${chart}.${name} is ${status} to '${room}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 + local authtoken="${1}" recipients="${2}" message="${3}" httpcode sent=0 room color msg_format notify + + # remove from the message + message="${message///}" + message="${message//<\/small>/}" + + if [ "${SEND_HIPCHAT}" = "YES" ] && [ -n "${HIPCHAT_SERVER}" ] && [ -n "${authtoken}" ] && [ -n "${recipients}" ] && [ -n "${message}" ]; then + # Valid values: html, text. + # Defaults to 'html'. + msg_format="html" + + # Background color for message. Valid values: yellow, green, red, purple, gray, random. Defaults to 'yellow'. + case "${status}" in + WARNING) color="yellow" ;; + CRITICAL) color="red" ;; + CLEAR) color="green" ;; + *) color="gray" ;; + esac + + # Whether this message should trigger a user notification (change the tab color, play a sound, notify mobile phones, etc). + # Each recipient's notification preferences are taken into account. + # Defaults to false. + notify="true" + + for room in ${recipients}; do + httpcode=$(docurl -X POST \ + -H "Content-type: application/json" \ + -H "Authorization: Bearer ${authtoken}" \ + -d "{\"color\": \"${color}\", \"from\": \"${host}\", \"message_format\": \"${msg_format}\", \"message\": \"${message}\", \"notify\": \"${notify}\"}" \ + "https://${HIPCHAT_SERVER}/v2/room/${room}/notification") + + if [ "${httpcode}" = "204" ]; then + info "sent HipChat notification for: ${host} ${chart}.${name} is ${status} to '${room}'" + sent=$((sent + 1)) + else + error "failed to send HipChat notification for: ${host} ${chart}.${name} is ${status} to '${room}' with HTTP response status code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 } # ----------------------------------------------------------------------------- # messagebird sender send_messagebird() { - local accesskey="${1}" messagebirdnumber="${2}" recipients="${3}" title="${4}" message="${5}" httpcode sent=0 user - if [ "${SEND_MESSAGEBIRD}" = "YES" ] && [ -n "${accesskey}" ] && [ -n "${messagebirdnumber}" ] && [ -n "${recipients}" ] && [ -n "${message}" ] && [ -n "${title}" ]; then - #https://developers.messagebird.com/docs/messaging - for user in ${recipients}; do - httpcode=$(docurl -X POST \ - --data-urlencode "originator=${messagebirdnumber}" \ - --data-urlencode "recipients=${user}" \ - --data-urlencode "body=${title} ${message}" \ - --data-urlencode "datacoding=auto" \ - -H "Authorization: AccessKey ${accesskey}" \ - "https://rest.messagebird.com/messages") - - if [ "${httpcode}" = "201" ]; then - info "sent Messagebird SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" - sent=$((sent + 1)) - else - error "failed to send Messagebird SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 + local accesskey="${1}" messagebirdnumber="${2}" recipients="${3}" title="${4}" message="${5}" httpcode sent=0 user + if [ "${SEND_MESSAGEBIRD}" = "YES" ] && [ -n "${accesskey}" ] && [ -n "${messagebirdnumber}" ] && [ -n "${recipients}" ] && [ -n "${message}" ] && [ -n "${title}" ]; then + #https://developers.messagebird.com/docs/messaging + for user in ${recipients}; do + httpcode=$(docurl -X POST \ + --data-urlencode "originator=${messagebirdnumber}" \ + --data-urlencode "recipients=${user}" \ + --data-urlencode "body=${title} ${message}" \ + --data-urlencode "datacoding=auto" \ + -H "Authorization: AccessKey ${accesskey}" \ + "https://rest.messagebird.com/messages") + + if [ "${httpcode}" = "201" ]; then + info "sent Messagebird SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" + sent=$((sent + 1)) + else + error "failed to send Messagebird SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP response status code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 } # ----------------------------------------------------------------------------- # kavenegar sender send_kavenegar() { - local API_KEY="${1}" kavenegarsender="${2}" recipients="${3}" title="${4}" message="${5}" httpcode sent=0 user - if [ "${SEND_KAVENEGAR}" = "YES" ] && [ -n "${API_KEY}" ] && [ -n "${kavenegarsender}" ] && [ -n "${recipients}" ] && [ -n "${message}" ] && [ -n "${title}" ]; then - # http://api.kavenegar.com/v1/{API-KEY}/sms/send.json - for user in ${recipients}; do - httpcode=$(docurl -X POST http://api.kavenegar.com/v1/${API_KEY}/sms/send.json \ - --data-urlencode "sender=${kavenegarsender}" \ - --data-urlencode "receptor=${user}" \ - --data-urlencode "message=${title} ${message}") - - if [ "${httpcode}" = "200" ]; then - info "sent Kavenegar SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" - sent=$((sent + 1)) - else - error "failed to send Kavenegar SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 + local API_KEY="${1}" kavenegarsender="${2}" recipients="${3}" title="${4}" message="${5}" httpcode sent=0 user + if [ "${SEND_KAVENEGAR}" = "YES" ] && [ -n "${API_KEY}" ] && [ -n "${kavenegarsender}" ] && [ -n "${recipients}" ] && [ -n "${message}" ] && [ -n "${title}" ]; then + # http://api.kavenegar.com/v1/{API-KEY}/sms/send.json + for user in ${recipients}; do + httpcode=$(docurl -X POST http://api.kavenegar.com/v1/${API_KEY}/sms/send.json \ + --data-urlencode "sender=${kavenegarsender}" \ + --data-urlencode "receptor=${user}" \ + --data-urlencode "message=${title} ${message}") + + if [ "${httpcode}" = "200" ]; then + info "sent Kavenegar SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" + sent=$((sent + 1)) + else + error "failed to send Kavenegar SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with HTTP response status code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 } # ----------------------------------------------------------------------------- # telegram sender send_telegram() { - local bottoken="${1}" chatids="${2}" message="${3}" httpcode sent=0 chatid emoji disableNotification="" - - if [ "${status}" = "CLEAR" ]; then disableNotification="--data-urlencode disable_notification=true"; fi - - case "${status}" in - WARNING) emoji="⚠️" ;; - CRITICAL) emoji="🔴" ;; - CLEAR) emoji="✅" ;; - *) emoji="⚪️" ;; - esac - - if [ "${SEND_TELEGRAM}" = "YES" ] && [ -n "${bottoken}" ] && [ -n "${chatids}" ] && [ -n "${message}" ]; then - for chatid in ${chatids}; do - # https://core.telegram.org/bots/api#sendmessage - httpcode=$(docurl ${disableNotification} \ - --data-urlencode "parse_mode=HTML" \ - --data-urlencode "disable_web_page_preview=true" \ - --data-urlencode "text=${emoji} ${message}" \ - "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatid}") - - if [ "${httpcode}" = "200" ]; then - info "sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'" - sent=$((sent + 1)) - elif [ "${httpcode}" = "401" ]; then - error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token." - else - error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP error code ${httpcode}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 + local bottoken="${1}" chatids="${2}" message="${3}" httpcode sent=0 chatid emoji disableNotification="" + + if [ "${status}" = "CLEAR" ]; then disableNotification="--data-urlencode disable_notification=true"; fi + + case "${status}" in + WARNING) emoji="⚠️" ;; + CRITICAL) emoji="🔴" ;; + CLEAR) emoji="✅" ;; + *) emoji="⚪️" ;; + esac + + if [ "${SEND_TELEGRAM}" = "YES" ] && [ -n "${bottoken}" ] && [ -n "${chatids}" ] && [ -n "${message}" ]; then + for chatid in ${chatids}; do + # https://core.telegram.org/bots/api#sendmessage + httpcode=$(docurl ${disableNotification} \ + --data-urlencode "parse_mode=HTML" \ + --data-urlencode "disable_web_page_preview=true" \ + --data-urlencode "text=${emoji} ${message}" \ + "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatid}") + + if [ "${httpcode}" = "200" ]; then + info "sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'" + sent=$((sent + 1)) + elif [ "${httpcode}" = "401" ]; then + error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token." + else + error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP response status code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 } # ----------------------------------------------------------------------------- @@ -1163,22 +1265,22 @@ send_telegram() { send_msteam() { - local webhook="${1}" channels="${2}" httpcode sent=0 channel color payload + local webhook="${1}" channels="${2}" httpcode sent=0 channel color payload - [ "${SEND_MSTEAM}" != "YES" ] && return 1 + [ "${SEND_MSTEAM}" != "YES" ] && return 1 - case "${status}" in - WARNING) icon="${MSTEAM_ICON_WARNING}" && color="${MSTEAM_COLOR_WARNING}" ;; - CRITICAL) icon="${MSTEAM_ICON_CRITICAL}" && color="${MSTEAM_COLOR_CRITICAL}" ;; - CLEAR) icon="${MSTEAM_ICON_CLEAR}" && color="${MSTEAM_COLOR_CLEAR}" ;; - *) icon="${MSTEAM_ICON_DEFAULT}" && color="${MSTEAM_COLOR_DEFAULT}" ;; - esac + case "${status}" in + WARNING) icon="${MSTEAM_ICON_WARNING}" && color="${MSTEAM_COLOR_WARNING}" ;; + CRITICAL) icon="${MSTEAM_ICON_CRITICAL}" && color="${MSTEAM_COLOR_CRITICAL}" ;; + CLEAR) icon="${MSTEAM_ICON_CLEAR}" && color="${MSTEAM_COLOR_CLEAR}" ;; + *) icon="${MSTEAM_ICON_DEFAULT}" && color="${MSTEAM_COLOR_DEFAULT}" ;; + esac - for channel in ${channels}; do - ## More details are available here regarding the payload syntax options : https://docs.microsoft.com/en-us/outlook/actionable-messages/message-card-reference - ## Online designer : https://acdesignerbeta.azurewebsites.net/ - payload="$( - cat </dev/null; then - info "sent Amazon SNS notification for: ${host} ${chart}.${name} is ${status} to '${target}'" - sent=$((sent + 1)) - else - error "failed to send Amazon SNS notification for: ${host} ${chart}.${name} is ${status} to '${target}'" - fi - done + for target in ${targets}; do + # Extract the region from the target ARN. We need to explicitly specify the region so that it matches up correctly. + region="$(echo ${target} | cut -f 4 -d ':')" + if ${aws} sns publish --region "${region}" --subject "${host} ${status_message} - ${name//_/ } - ${chart}" --message "${message}" --target-arn ${target} &>/dev/null; then + info "sent Amazon SNS notification for: ${host} ${chart}.${name} is ${status} to '${target}'" + sent=$((sent + 1)) + else + error "failed to send Amazon SNS notification for: ${host} ${chart}.${name} is ${status} to '${target}'" + fi + done - [ ${sent} -gt 0 ] && return 0 + [ ${sent} -gt 0 ] && return 0 - return 1 + return 1 +} + +# ----------------------------------------------------------------------------- +# Matrix sender + +send_matrix() { + local homeserver="${1}" webhook accesstoken rooms="${2}" httpcode sent=0 payload + + [ "${SEND_MATRIX}" != "YES" ] && return 1 + [ -z "${MATRIX_ACCESSTOKEN}" ] && return 1 + + accesstoken="${MATRIX_ACCESSTOKEN}" + + case "${status}" in + WARNING) emoji="⚠️" ;; + CRITICAL) emoji="🔴" ;; + CLEAR) emoji="✅" ;; + *) emoji="⚪️" ;; + esac + + for room in ${rooms}; do + webhook="$homeserver/_matrix/client/r0/rooms/$(urlencode $room)/send/m.room.message?access_token=$accesstoken" + payload="$( + cat <${name//_/ }
${chart} (${family})
${alarm}
${info}", + "body": "${emoji} ${host} ${status_message} - ${name//_/ } ${chart} (${family}) ${goto_url} ${alarm} ${info}" + } +EOF + )" + + httpcode=$(docurl -X POST --data "${payload}" "${webhook}") + if [ "${httpcode}" == "200" ]; then + info "sent Matrix notification for: ${host} ${chart}.${name} is ${status} to '${room}'" + sent=$((sent + 1)) + else + error "failed to send Matrix notification for: ${host} ${chart}.${name} is ${status} to '${room}', with HTTP response status code ${httpcode}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + + return 1 } # ----------------------------------------------------------------------------- # syslog sender send_syslog() { - local facility=${SYSLOG_FACILITY:-"local6"} level='info' targets="${1}" - local priority='' message='' host='' port='' prefix='' - local temp1='' temp2='' - - [ "${SEND_SYSLOG}" = "YES" ] || return 1 - - if [ "${status}" = "CRITICAL" ]; then - level='crit' - elif [ "${status}" = "WARNING" ]; then - level='warning' - fi - - for target in ${targets}; do - priority="${facility}.${level}" - message='' - host='' - port='' - prefix='' - temp1='' - temp2='' - - prefix=$(echo ${target} | cut -d '/' -f 2) - temp1=$(echo ${target} | cut -d '/' -f 1) - - if [ ${prefix} != ${temp1} ]; then - if (echo ${temp1} | grep -q '@'); then - temp2=$(echo ${temp1} | cut -d '@' -f 1) - host=$(echo ${temp1} | cut -d '@' -f 2) - - if [ ${temp2} != ${host} ]; then - priority=${temp2} - fi - - port=$(echo ${host} | rev | cut -d ':' -f 1 | rev) - - if (echo ${host} | grep -E -q '\[.*\]'); then - if (echo ${port} | grep -q ']'); then - port='' - else - host=$(echo ${host} | rev | cut -d ':' -f 2- | rev) - fi - else - if [ ${port} = ${host} ]; then - port='' - else - host=$(echo ${host} | cut -d ':' -f 1) - fi - fi - else - priority=${temp1} - fi - fi - - message="${prefix} ${status} on ${host} at ${date}: ${chart} ${value_string}" - - if [ ${host} ]; then - logger_options="${logger_options} -n ${host}" - if [ ${port} ]; then - logger_options="${logger_options} -P ${port}" - fi - fi - - ${logger} -p ${priority} ${logger_options} "${message}" - done - - return $? + local facility=${SYSLOG_FACILITY:-"local6"} level='info' targets="${1}" + local priority='' message='' server='' port='' prefix='' + local temp1='' temp2='' + + [ "${SEND_SYSLOG}" = "YES" ] || return 1 + + if [ "${status}" = "CRITICAL" ]; then + level='crit' + elif [ "${status}" = "WARNING" ]; then + level='warning' + fi + + for target in ${targets}; do + priority="${facility}.${level}" + message='' + server='' + port='' + prefix='' + temp1='' + temp2='' + + prefix=$(echo ${target} | cut -d '/' -f 2) + temp1=$(echo ${target} | cut -d '/' -f 1) + + if [ ${prefix} != ${temp1} ]; then + if (echo ${temp1} | grep -q '@'); then + temp2=$(echo ${temp1} | cut -d '@' -f 1) + server=$(echo ${temp1} | cut -d '@' -f 2) + + if [ ${temp2} != ${server} ]; then + priority=${temp2} + fi + + port=$(echo ${server} | rev | cut -d ':' -f 1 | rev) + + if (echo ${server} | grep -E -q '\[.*\]'); then + if (echo ${port} | grep -q ']'); then + port='' + else + server=$(echo ${server} | rev | cut -d ':' -f 2- | rev) + fi + else + if [ ${port} = ${server} ]; then + port='' + else + server=$(echo ${server} | cut -d ':' -f 1) + fi + fi + else + priority=${temp1} + fi + fi + + message="${prefix} ${status} on ${host} at ${date}: ${chart} ${value_string}" + + if [ ${server} ]; then + logger_options="${logger_options} -n ${server}" + if [ ${port} ]; then + logger_options="${logger_options} -P ${port}" + fi + fi + + ${logger} -p ${priority} ${logger_options} "${message}" + done + + return $? } # ----------------------------------------------------------------------------- # SMS sender send_sms() { - local recipients="${1}" errcode errmessage sent=0 + local recipients="${1}" errcode errmessage sent=0 # Human readable SMS local msg="${host} ${status_message}: ${chart} (${family}), ${alarm}" @@ -1765,129 +1912,283 @@ send_sms() { # limit it to 160 characters msg="${msg:0:160}" - if [ "${SEND_SMS}" = "YES" ] && [ -n "${sendsms}" ] && [ -n "${recipients}" ] && [ -n "${msg}" ]; then - # http://api.kavenegar.com/v1/{API-KEY}/sms/send.json - for phone in ${recipients}; do - errmessage=$($sendsms $phone "$msg" 2>&1) - errcode=$? - if [ ${errcode} -eq 0 ]; then - info "sent smstools3 SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" - sent=$((sent + 1)) - else - error "failed to send smstools3 SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with error code ${errcode}: ${errmessage}." - fi - done - - [ ${sent} -gt 0 ] && return 0 - fi - - return 1 + if [ "${SEND_SMS}" = "YES" ] && [ -n "${sendsms}" ] && [ -n "${recipients}" ] && [ -n "${msg}" ]; then + # http://api.kavenegar.com/v1/{API-KEY}/sms/send.json + for phone in ${recipients}; do + errmessage=$($sendsms $phone "$msg" 2>&1) + errcode=$? + if [ ${errcode} -eq 0 ]; then + info "sent smstools3 SMS for: ${host} ${chart}.${name} is ${status} to '${user}'" + sent=$((sent + 1)) + else + error "failed to send smstools3 SMS for: ${host} ${chart}.${name} is ${status} to '${user}' with error code ${errcode}: ${errmessage}." + fi + done + + [ ${sent} -gt 0 ] && return 0 + fi + + return 1 } # ----------------------------------------------------------------------------- # hangouts sender send_hangouts() { - local rooms="${1}" httpcode sent=0 room color payload webhook - - [ "${SEND_HANGOUTS}" != "YES" ] && return 1 - - case "${status}" in - WARNING) color="#ffa700" ;; - CRITICAL) color="#d62d20" ;; - CLEAR) color="#008744" ;; - *) color="#777777" ;; - esac - - for room in ${rooms}; do - if [ -z "${HANGOUTS_WEBHOOK_URI[$room]}" ] ; then - info "Can't send Hangouts notification for: ${host} ${chart}.${name} to room ${room}. HANGOUTS_WEBHOOK_URI[$room] not defined" - else - webhook="${HANGOUTS_WEBHOOK_URI[$room]}" - payload="$( - cat <${host}", - "widgets": [ - { - "keyValue": { - "topLabel": "Status Message", - "content": "${status_message}", - "contentMultiline": "true", - "iconUrl": "${image}", - "onClick": { - "openLink": { - "url": "${goto_url}" - } - } - } - }, - { - "keyValue": { - "topLabel": "${chart} | ${family}", - "content": "${alarm}", - "contentMultiline": "true" - } - } - ] - }, - { - "widgets": [ - { - "textParagraph": { - "text": "@ ${date}\n${info}" - } - } - ] - }, - { - "widgets": [ - { - "buttons": [ - { - "textButton": { - "text": "Go to ${host}", - "onClick": { - "openLink": { - "url": "${goto_url}" - } - } - } - } - ] - } - ] - } - ] - } - ] - } + local rooms="${1}" httpcode sent=0 room color payload webhook thread + + [ "${SEND_HANGOUTS}" != "YES" ] && return 1 + + case "${status}" in + WARNING) color="#ffa700" ;; + CRITICAL) color="#d62d20" ;; + CLEAR) color="#008744" ;; + *) color="#777777" ;; + esac + + for room in ${rooms}; do + if [ -z "${HANGOUTS_WEBHOOK_URI[$room]}" ] ; then + info "Can't send Hangouts notification for: ${host} ${chart}.${name} to room ${room}. HANGOUTS_WEBHOOK_URI[$room] not defined" + else + if [ -n "${HANGOUTS_WEBHOOK_THREAD[$room]}" ]; then + thread="\"name\" : \"${HANGOUTS_WEBHOOK_THREAD[$room]}\"" + fi + webhook="${HANGOUTS_WEBHOOK_URI[$room]}" + payload="$( + cat <${host}", + "widgets": [ + { + "keyValue": { + "topLabel": "Status Message", + "content": "${status_message}", + "contentMultiline": "true", + "iconUrl": "${image}", + "onClick": { + "openLink": { + "url": "${goto_url}" + } + } + } + }, + { + "keyValue": { + "topLabel": "${chart} | ${family}", + "content": "${alarm}", + "contentMultiline": "true" + } + } + ] + }, + { + "widgets": [ + { + "textParagraph": { + "text": "@ ${date}\n${info}" + } + } + ] + }, + { + "widgets": [ + { + "buttons": [ + { + "textButton": { + "text": "Go to ${host}", + "onClick": { + "openLink": { + "url": "${goto_url}" + } + } + } + } + ] + } + ] + } + ] + } + ], + "thread": { + $thread + } + } EOF - )" + )" - httpcode=$(docurl -H "Content-Type: application/json" -X POST -d "${payload}" "${webhook}") + httpcode=$(docurl -H "Content-Type: application/json" -X POST -d "${payload}" "${webhook}") - if [ "${httpcode}" = "200" ]; then - info "sent hangouts notification for: ${host} ${chart}.${name} is ${status} to '${room}'" - sent=$((sent + 1)) - else - error "failed to send hangouts notification for: ${host} ${chart}.${name} is ${status} to '${room}', with HTTP error code ${httpcode}." - fi - fi - done + if [ "${httpcode}" = "200" ]; then + info "sent hangouts notification for: ${host} ${chart}.${name} is ${status} to '${room}'" + sent=$((sent + 1)) + else + error "failed to send hangouts notification for: ${host} ${chart}.${name} is ${status} to '${room}', with HTTP response status code ${httpcode}." + fi + fi + done - [ ${sent} -gt 0 ] && return 0 + [ ${sent} -gt 0 ] && return 0 + + return 1 +} - return 1 +# ----------------------------------------------------------------------------- +# Dynatrace sender + +send_dynatrace() { + [ "${SEND_DYNATRACE}" != "YES" ] && return 1 + + local dynatrace_url="${DYNATRACE_SERVER}/e/${DYNATRACE_SPACE}/api/v1/events" + local description="NetData Notification for: ${host} ${chart}.${name} is ${status}" + local payload="" + + payload=$(cat < + # alerta.io The [Alerta](https://alerta.io) monitoring system is a tool used to @@ -23,7 +29,7 @@ The easiest way to install Alerta is to use the Docker image available on [Docker hub][1]. Alternatively, follow the ["getting started"][2] tutorial to deploy Alerta to an Ubuntu server. More advanced configurations are out os scope of this tutorial but information -about different deployment scenaries can be found in the [docs][3]. +about different deployment scenarios can be found in the [docs][3]. [1]: https://hub.docker.com/r/alerta/alerta-web/ @@ -80,7 +86,7 @@ We can test alarms using the standard approach: Note: Netdata will send 3 alarms, and because last alarm is "CLEAR" you will not see them in main Alerta page, you need to select to see -"closed" alarma in top-right lookup. A little change in `alarm-notify.sh` +"closed" alarm in top-right lookup. A little change in `alarm-notify.sh` that let us test each state one by one will be useful. For more information see diff --git a/health/notifications/awssns/README.md b/health/notifications/awssns/README.md index ed838dacb..c68251379 100644 --- a/health/notifications/awssns/README.md +++ b/health/notifications/awssns/README.md @@ -1,3 +1,9 @@ + + # Amazon SNS As part of it's AWS suite, Amazon provides a notification broker service called 'Simple Notification Service' or SNS. Amazon SNS works kind of similarly to Netdata's own notification system, allowing dispatch of a single notification to multiple subscribers of different types. Among other things, SNS supports sending notifications to: @@ -13,7 +19,9 @@ To get this working, you will need: - The Amazon Web Services CLI tools. Most distributions provide these with the package name `awscli`. - An actual home directory for the user you run Netdata as, instead of just using `/` as a home directory. Setup of this is distribution specific. `/var/lib/netdata` is the recommended directory (because the permissions will already be correct) if you are using a dedicated user (which is how most distributions work). -- An Amazon SNS topic to send notifications to with one or more subscribers. The [Getting Started](https://docs.aws.amazon.com/sns/latest/dg/GettingStarted.html) section of the Amazon SNS documentation covers the basics of how to set this up. Make note of the Topic ARN when you create the topic. +- An Amazon SNS topic to send notifications to with one or more subscribers. The [Getting + Started](https://docs.aws.amazon.com/sns/latest/dg/sns-getting-started.html) section of the Amazon SNS documentation + covers the basics of how to set this up. Make note of the Topic ARN when you create the topic. - While not mandatory, it is highly recommended to create a dedicated IAM user on your account for Netdata to send notifications. This user needs to have programmatic access, and should only allow access to SNS. If you're really paranoid, you can create one for each system or group of systems. Once you have all the above, run the following command as the user Netdata runs under: @@ -29,7 +37,7 @@ Once that's done, you're ready to go and can specify the desired topic ARN as a Notes: - Netdata's native email notification support is far better in almost all respects than it's support through Amazon SNS. If you want email notifications, use the native support, not SNS. - - If you need to change the notification format for SNS notifications, you can do so by specifying the format in `AWSSNS_MESSAGE_FORMAT` in the configuration. This variable supports all the same vairiables you can use in custom notifications. + - If you need to change the notification format for SNS notifications, you can do so by specifying the format in `AWSSNS_MESSAGE_FORMAT` in the configuration. This variable supports all the same variables you can use in custom notifications. - While Amazon SNS supports sending differently formatted messages for different delivery methods, Netdata does not currently support this functionality. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fawssns%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/notifications/custom/README.md b/health/notifications/custom/README.md index 18df79541..04376d555 100644 --- a/health/notifications/custom/README.md +++ b/health/notifications/custom/README.md @@ -1,8 +1,13 @@ + + # Custom Netdata allows you to send custom notifications to any endpoint you choose. -To configure custom notifications, you will need to customize `health_alarm_notify.conf`. You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples of how to modify the `custom_sender()` function in `health_alarm_notify.conf`. Ensure you follow the instructions of changing any configuration file to [persist your configuration](../../../docs/configuration-guide.md#persist-my-configuration). +To configure custom notifications, you will need to customize `health_alarm_notify.conf`. You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples of how to modify the `custom_sender()` function in `health_alarm_notify.conf`. Ensure you follow the instructions of changing any configuration file to [persist your configuration](/docs/configuration-guide.md#persist-my-configuration). As with other notifications, you will also need to define the recipient list in `DEFAULT_RECIPIENT_CUSTOM` and/or the `role_recipients_custom` array. diff --git a/health/notifications/discord/README.md b/health/notifications/discord/README.md index 88e0a970e..1650d9cec 100644 --- a/health/notifications/discord/README.md +++ b/health/notifications/discord/README.md @@ -1,3 +1,8 @@ + + # Discordapp.com This is what you will get: diff --git a/health/notifications/dynatrace/Makefile.inc b/health/notifications/dynatrace/Makefile.inc new file mode 100644 index 000000000..a2ae623fb --- /dev/null +++ b/health/notifications/dynatrace/Makefile.inc @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_noinst_DATA += \ + dynatrace/README.md \ + dynatrace/Makefile.inc \ + $(NULL) + diff --git a/health/notifications/dynatrace/README.md b/health/notifications/dynatrace/README.md new file mode 100644 index 000000000..cc82ee78d --- /dev/null +++ b/health/notifications/dynatrace/README.md @@ -0,0 +1,36 @@ + + +# Dynatrace + +Dynatrace allows you to receive notifications using their Events REST API. + +See [the Dynatrace documentation](https://www.dynatrace.com/support/help/extend-dynatrace/dynatrace-api/environment-api/events/post-event/) about POSTing an event in the Events API for more details. + + + +You need: + +1. Dynatrace Server. You can use the same on all your Netdata servers but make sure the server is network visible from your Netdata hosts. +The Dynatrace server should be with protocol prefixed (`http://` or `https://`). For example: `https://monitor.example.com` +This is a required parameter. +2. API Token. Generate a secure access API token that enables access to your Dynatrace monitoring data via the REST-based API. +Generate a Dynatrace API authentication token. On your Dynatrace server, go to **Settings** --> **Integration** --> **Dynatrace API** --> **Generate token**. +See [Dynatrace API - Authentication](https://www.dynatrace.com/support/help/extend-dynatrace/dynatrace-api/basics/dynatrace-api-authentication/) for more details. +This is a required parameter. +3. API Space. This is the URL part of the page you have access in order to generate the API Token. For example, the URL + for a generated API token might look like: + `https://monitor.illumineit.com/e/2a93fe0e-4cd5-469a-9d0d-1a064235cfce/#settings/integration/apikeys;gf=all` In that + case, my space is _2a93fe0e-4cd5-469a-9d0d-1a064235cfce_ This is a required parameter. +4. Generate a Server Tag. On your Dynatrace Server, go to **Settings** --> **Tags** --> **Manually applied tags** and create the Tag. +The Netdata alarm is sent as a Dynatrace Event to be correlated with all those hosts tagged with this Tag you have created. +This is a required parameter. +5. Specify the Dynatrace event. This can be one of `CUSTOM_INFO`, `CUSTOM_ANNOTATION`, `CUSTOM_CONFIGURATION`, and `CUSTOM_DEPLOYMENT`. +The default value is `CUSTOM_INFO`. +This is a required parameter. +6. Specify the annotation type. This is the source of the Dynatrace event. Put whatever it fits you, for example, +_Netdata Alarm_, which is also the default value. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fdynatrace%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md index bf03887ac..827a9c0be 100644 --- a/health/notifications/email/README.md +++ b/health/notifications/email/README.md @@ -1,3 +1,8 @@ + + # Email You need a working `sendmail` command for email alerts to work. Almost all MTAs provide a `sendmail` interface. @@ -50,7 +55,7 @@ sendmail="/usr/bin/msmtp" (sudo) su -s /bin/bash netdata ``` - Configure `~/.msmtprc` as shown [in the documentation](https://marlam.de/msmtp/documentation/). -- Finaly set the appropriate permissions on the `.msmtprc` file : +- Finally set the appropriate permissions on the `.msmtprc` file : ```sh chmod 600 ~/.msmtprc ``` diff --git a/health/notifications/flock/README.md b/health/notifications/flock/README.md index 658fc7b82..b24ecdb25 100644 --- a/health/notifications/flock/README.md +++ b/health/notifications/flock/README.md @@ -1,4 +1,9 @@ -# flock.com + + +# Flock This is what you will get: diff --git a/health/notifications/hangouts/README.md b/health/notifications/hangouts/README.md index 25dccad61..886abfc43 100644 --- a/health/notifications/hangouts/README.md +++ b/health/notifications/hangouts/README.md @@ -1,16 +1,33 @@ -# Hangouts Chat + + +# Send notifications to Google Hangouts + +[Google Hangouts](https://hangouts.google.com/) is a cross-platform messaging app developed by Google. You can configure +Netdata to send alarm notifications to a Hangouts room in order to stay aware of possible health or performance issues +on your nodes. Here's an example of the notification in action: -This is what you will get: ![Netdata on Hangouts](https://user-images.githubusercontent.com/1153921/66427166-47de6900-e9c8-11e9-8322-b4b03f084dc1.png) + To receive notifications in Google Hangouts, you need the following in your Hangouts setup: -1. One or more rooms -2. An **incoming webhook** for each room +1. One or more rooms. +2. An **incoming webhook** for each room. + +Follow [Google's documentation](https://developers.google.com/hangouts/chat/how-tos/webhooks) to create an incoming +webhook for each room you want to send Netdata notifications to. -How to create an incoming webhook: -https://developers.google.com/hangouts/chat/how-tos/webhooks +Set the webhook URIs and room names in `health_alarm_notify.conf`. To edit it on your system, run +`/etc/netdata/edit-config health_alarm_notify.conf`): -Set the webhook URIs and room names in `health_alarm_notify.conf`. To edit it on your system, run `/etc/netdata/edit-config health_alarm_notify.conf`): +## Threads (optional) + +Instead to receive alarms on different threads, Netdata allows you to concentrate them inside an unique thread when you +set the variable `HANGOUTS_WEBHOOK_THREAD[NAME]`. ``` #------------------------------------------------------------------------------ @@ -23,11 +40,16 @@ SEND_HANGOUTS="YES" # HANGOUTS_WEBHOOK_URI[ROOM_NAME]="URLforroom1" HANGOUTS_WEBHOOK_URI[systems]="https://chat.googleapis.com/v1/spaces/AAAAXXXXXXX/..." HANGOUTS_WEBHOOK_URI[development]="https://chat.googleapis.com/v1/spaces/AAAAYYYYY/..." +# On Hangouts, copy a thread link and change the values for space and thread +# HANGOUTS_WEBHOOK_THREAD[systems]="spaces/AAAAXXXXXXX/threads/XXXXXXXXXXX" # if a DEFAULT_RECIPIENT_HANGOUTS are not configured, # notifications wouldn't be send to hangouts rooms. # DEFAULT_RECIPIENT_HANGOUTS="systems development|critical" DEFAULT_RECIPIENT_HANGOUTS="sysadmin devops alarms|critical" ``` + You can define multiple rooms like this: `sysadmin devops alarms|critical`. -The keywords `sysadmin`, `devops` and `alarms` are Hangouts rooms. +The keywords `sysadmin`, `devops`, and `alarms` are Hangouts rooms. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fhangouts%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf index 5540dfc13..827a47d99 100755 --- a/health/notifications/health_alarm_notify.conf +++ b/health/notifications/health_alarm_notify.conf @@ -76,8 +76,8 @@ date_format='' # the line below to have Netdata instead use the host's fully qualified # domain name. # -# This does not report correct FQDN's for slave systems for which this -# sytem is a master. +# This does not report correct FQDN's for child systems for which this +# system is a parent. # # Additionally, if the system host name is overridden in /etc/netdata.conf # with the `hostname` option, that name will be used unconditionally @@ -163,7 +163,10 @@ sendsms="" # You can append |critical to limit the notifications to be sent. # # In these examples, the first recipient receives all the alarms -# while the second one receives only the critical ones: +# while the second one receives only notifications for alarms that +# have at some point become critical. The second user may still receive +# warning and clear notifications, but only for the event that previously +# caused a critical alarm. # # email : "user1@example.com user2@example.com|critical" # pushover : "2987343...9437837 8756278...2362736|critical" @@ -229,6 +232,61 @@ DEFAULT_RECIPIENT_EMAIL="root" # to not send HTML but Plain Text only emails. #EMAIL_PLAINTEXT_ONLY="YES" +#------------------------------------------------------------------------------ +# Dynatrace global notification options +#------------------------------------------------------------------------------ +# enable/disable sending Dynatrace notifications +SEND_DYNATRACE="YES" + +# The Dynatrace server with protocol prefix (http:// or https://), example https://monitor.illumineit.com +# Required +DYNATRACE_SERVER="" + +# Generate a Dynatrace API authentication token +# Read https://www.dynatrace.com/support/help/extend-dynatrace/dynatrace-api/basics/dynatrace-api-authentication/ +# On Dynatrace server goto Settings --> Integration --> Dynatrace API --> Generate token +# Required +DYNATRACE_TOKEN="" + +# Beware: Space is taken from dynatrace URL from browser when you create the TOKEN +# Required +DYNATRACE_SPACE="" + +# Generate a Server Tag. On the Dynatrace Server go to Settings --> Tags --> Manually applied tags create the Tag +# The NetData alarm will be sent as a Dynatrace Event to be correlated with all those hosts tagged with this Tag +# you created. +# Required +DYNATRACE_TAG_VALUE="" + +# Change this to what you want +DYNATRACE_ANNOTATION_TYPE="NetData Alarm" + +# This can be CUSTOM_INFO, CUSTOM_ANNOTATION, CUSTOM_CONFIGURATION, CUSTOM_DEPLOYMENT +# Applying default value +# Required +DYNATRACE_EVENT="CUSTOM_INFO" + + +DEFAULT_RECIPIENT_DYNATRACE="" + +#------------------------------------------------------------------------------ +# Stackpulse global notification options +SEND_STACKPULSE="YES" + +# Webhook +STACKPULSE_WEBHOOK="" + +DEFAULT_RECIPIENT_STACKPULSE="" + +#------------------------------------------------------------------------------ +# opsgenie global notification options +SEND_OPSGENIE="YES" + +# Api key +OPSGENIE_API_KEY="" + +DEFAULT_RECIPIENT_OPSGENIE="" + #------------------------------------------------------------------------------ # hangouts (google hangouts chat) global notification options @@ -572,6 +630,9 @@ SEND_PD="YES" # (empty = do not send a notification for unconfigured roles): DEFAULT_RECIPIENT_PD="" +# Which PD API are we going to use? For version 2 or newer, it is necessary to do a request for Pagerduty +# before to set the version(https://developer.pagerduty.com/docs/events-api-v2/overview/). +USE_PD_VERSION="1" #------------------------------------------------------------------------------ # fleep notification options @@ -739,6 +800,26 @@ SEND_SMS="YES" DEFAULT_RECIPIENT_SMS="" +# Matrix notifications +# + +# enable/disable Matrix notifications +SEND_MATRIX="YES" + +# The url of the Matrix homeserver +# e.g https://matrix.org:8448 +MATRIX_HOMESERVER= + +# An access token from a valid Matrix account. Tokens usually don't expire, +# can be controlled from a Matrix client. +# See https://matrix.org/docs/guides/client-server.html +MATRIX_ACCESSTOKEN= + +# Specify the default rooms to receive the notification if no rooms are provided +# in a role's recipients. +# The format is !roomid:homeservername +DEFAULT_RECIPIENT_MATRIX="" + #------------------------------------------------------------------------------ # custom notifications # @@ -873,6 +954,14 @@ role_recipients_msteam[sysadmin]="${DEFAULT_RECIPIENT_MSTEAM}" role_recipients_rocketchat[sysadmin]="${DEFAULT_RECIPIENT_ROCKETCHAT}" +role_recipients_dynatrace[sysadmin]="${DEFAULT_RECIPIENT_DYNATRACE}" + +role_recipients_opsgenie[sysadmin]="${DEFAULT_RECIPIENT_OPSGENIE}" + +role_recipients_matrix[sysadmin]="${DEFAULT_RECIPIENT_MATRIX}" + +role_recipients_stackpulse[sysadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # DNS related alarms @@ -922,6 +1011,14 @@ role_recipients_rocketchat[domainadmin]="${DEFAULT_RECIPIENT_ROCKETCHAT}" role_recipients_sms[domainadmin]="${DEFAULT_RECIPIENT_SMS}" +role_recipients_dynatrace[domainadmin]="${DEFAULT_RECIPIENT_DYNATRACE}" + +role_recipients_opsgenie[domainadmin]="${DEFAULT_RECIPIENT_OPSGENIE}" + +role_recipients_matrix[domainadmin]="${DEFAULT_RECIPIENT_MATRIX}" + +role_recipients_stackpulse[domainadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # database servers alarms # mysql, redis, memcached, postgres, etc @@ -972,6 +1069,14 @@ role_recipients_rocketchat[dba]="${DEFAULT_RECIPIENT_ROCKETCHAT}" role_recipients_sms[dba]="${DEFAULT_RECIPIENT_SMS}" +role_recipients_dynatrace[dba]="${DEFAULT_RECIPIENT_DYNATRACE}" + +role_recipients_opsgenie[dba]="${DEFAULT_RECIPIENT_OPSGENIE}" + +role_recipients_matrix[dba]="${DEFAULT_RECIPIENT_MATRIX}" + +role_recipients_stackpulse[dba]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # web servers alarms # apache, nginx, lighttpd, etc @@ -1022,6 +1127,14 @@ role_recipients_rocketchat[webmaster]="${DEFAULT_RECIPIENT_ROCKETCHAT}" role_recipients_sms[webmaster]="${DEFAULT_RECIPIENT_SMS}" +role_recipients_dynatrace[webmaster]="${DEFAULT_RECIPIENT_DYNATRACE}" + +role_recipients_opsgenie[webmaster]="${DEFAULT_RECIPIENT_OPSGENIE}" + +role_recipients_matrix[webmaster]="${DEFAULT_RECIPIENT_MATRIX}" + +role_recipients_stackpulse[webmaster]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # proxy servers alarms # squid, etc @@ -1072,6 +1185,13 @@ role_recipients_rocketchat[proxyadmin]="${DEFAULT_RECIPIENT_ROCKETCHAT}" role_recipients_sms[proxyadmin]="${DEFAULT_RECIPIENT_SMS}" +role_recipients_dynatrace[proxyadmin]="${DEFAULT_RECIPIENT_DYNATRACE}" + +role_recipients_opsgenie[proxyadmin]="${DEFAULT_RECIPIENT_OPSGENIE}" + +role_recipients_matrix[proxyadmin]="${DEFAULT_RECIPIENT_MATRIX}" + +role_recipients_stackpulse[proxyadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" # ----------------------------------------------------------------------------- # peripheral devices @@ -1121,3 +1241,10 @@ role_recipients_rocketchat[sitemgr]="${DEFAULT_RECIPIENT_ROCKETCHAT}" role_recipients_sms[sitemgr]="${DEFAULT_RECIPIENT_SMS}" +role_recipients_dynatrace[sitemgr]="${DEFAULT_RECIPIENT_DYNATRACE}" + +role_recipients_opsgenie[sitemgr]="${DEFAULT_RECIPIENT_OPSGENIE}" + +role_recipients_matrix[sitemgr]="${DEFAULT_RECIPIENT_MATRIX}" + +role_recipients_stackpulse[sitemgr]="${DEFAULT_RECIPIENT_STACKPULSE}" diff --git a/health/notifications/irc/README.md b/health/notifications/irc/README.md index 36590eb79..e7f22e1fd 100644 --- a/health/notifications/irc/README.md +++ b/health/notifications/irc/README.md @@ -1,3 +1,8 @@ + + # IRC This is what you will get: @@ -24,7 +29,7 @@ Set the path for `nc` in `/etc/netdata/health_alarm_notify.conf` (to edit it on nc="/usr/bin/nc" ``` -2. Αn `IRC_NETWORK` to which your preffered channels belong to. +2. Αn `IRC_NETWORK` to which your preferred channels belong to. 3. One or more channels ( `DEFAULT_RECIPIENT_IRC` ) to post the messages to. 4. An `IRC_NICKNAME` and an `IRC_REALNAME` to identify in IRC. diff --git a/health/notifications/kavenegar/README.md b/health/notifications/kavenegar/README.md index 495b5338e..b59799fc2 100644 --- a/health/notifications/kavenegar/README.md +++ b/health/notifications/kavenegar/README.md @@ -1,6 +1,11 @@ + + # Kavenegar -[Kavenegar](https://www.kavenegar.com/) as service for software developers, based in Iran, provides send and receive SMS, calling voice by using its APIs. +[Kavenegar](https://kavenegar.com/) as service for software developers, based in Iran, provides send and receive SMS, calling voice by using its APIs. Will look like this on your Android device: @@ -9,7 +14,7 @@ Will look like this on your Android device: You will need: 1. Signup and Login to kavenegar.com -2. Get your APIKEY and Sender from +2. Get your APIKEY and Sender from `http://panel.kavenegar.com/client/setting/account` 3. Fill in KAVENEGAR_API_KEY="" KAVENEGAR_SENDER="" 4. Add the recipient phone numbers to DEFAULT_RECIPIENT_KAVENEGAR="" diff --git a/health/notifications/matrix/Makefile.inc b/health/notifications/matrix/Makefile.inc new file mode 100644 index 000000000..9937d80c3 --- /dev/null +++ b/health/notifications/matrix/Makefile.inc @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_noinst_DATA += \ + matrix/README.md \ + matrix/Makefile.inc \ + $(NULL) + diff --git a/health/notifications/matrix/README.md b/health/notifications/matrix/README.md new file mode 100644 index 000000000..ea22b4a8a --- /dev/null +++ b/health/notifications/matrix/README.md @@ -0,0 +1,58 @@ + + +# Matrix + +Send notifications to [Matrix](https://matrix.org/) network rooms. + +The requirements for this notification method are: + +1. The url of the homeserver (`https://homeserver:port`). +2. Credentials for connecting to the homeserver, in the form of a valid access token for your account (or for a + dedicated notification account). These tokens usually don't expire. +3. The room ids that you want to sent the notification to. + +To obtain the access token, you can use the following `curl` command: + +```bash +curl -XPOST -d '{"type":"m.login.password", "user":"example", "password":"wordpass"}' "https://homeserver:8448/_matrix/client/r0/login" +``` + +The room ids are unique identifiers and can be obtained from the room settings in a Matrix client (e.g. Riot). Their +format is `!uniqueid:homeserver`. + +Multiple room ids can be defined by separating with a space character. + +Detailed information about the Matrix client API is available at the [official +site](https://matrix.org/docs/guides/client-server.html). + +Your `health_alarm_notify.conf` should look like this: + +```conf +############################################################################### +# Matrix notifications +# + +# enable/disable Matrix notifications +SEND_MATRIX="YES" + +# The url of the Matrix homeserver +# e.g https://matrix.org:8448 +MATRIX_HOMESERVER="https://matrix.org:8448" + +# A access token from a valid Matrix account. Tokens usually don't expire, +# can be controlled from a Matrix client. +# See https://matrix.org/docs/guides/client-server.html +MATRIX_ACCESSTOKEN="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" + +# Specify the default rooms to receive the notification if no rooms are provided +# in a role's recipients. +# The format is !roomid:homeservername +DEFAULT_RECIPIENT_MATRIX="!XXXXXXXXXXXX:matrix.org" +``` + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fmatrix%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/notifications/messagebird/README.md b/health/notifications/messagebird/README.md index 30d52a4f2..8e3d1a5b3 100644 --- a/health/notifications/messagebird/README.md +++ b/health/notifications/messagebird/README.md @@ -1,3 +1,8 @@ + + # Messagebird The messagebird notifications will look like this on your Android device: diff --git a/health/notifications/opsgenie/Makefile.inc b/health/notifications/opsgenie/Makefile.inc new file mode 100644 index 000000000..c85bb7c32 --- /dev/null +++ b/health/notifications/opsgenie/Makefile.inc @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_noinst_DATA += \ + opsgenie/README.md \ + opsgenie/Makefile.inc \ + $(NULL) + diff --git a/health/notifications/opsgenie/README.md b/health/notifications/opsgenie/README.md new file mode 100644 index 000000000..aeb315489 --- /dev/null +++ b/health/notifications/opsgenie/README.md @@ -0,0 +1,59 @@ + + +# Send notifications to Opsgenie + +[Opsgenie](https://www.atlassian.com/software/opsgenie) is an alerting and incident response tool. It is designed to +group and filter alarms, build custom routing rules for on-call teams, and correlate deployments and commits to +incidents. + +The first step is to create a [Netdata integration](https://docs.opsgenie.com/docs/api-integration) in the +[Opsgenie](https://www.atlassian.com/software/opsgenie) dashboard. After this, you need to edit +`health_alarm_notify.conf` on your system, by running the following from your [config +directory](/docs/configure/nodes.md): + +```bash +./edit-config health_alarm_notify.conf +``` + +Change the variable `OPSGENIE_API_KEY` with the API key you got from Opsgenie. + +``` +SEND_OPSGENIE="YES" + +# Api key +# Default Opsgenie APi +OPSGENIE_API_KEY="11111111-2222-3333-4444-555555555555" +``` + +Changes to `health_alarm_notify.conf` do not require a Netdata restart. You can test your Opsgenie notifications +configuration by issuing the commands, replacing `ROLE` with your preferred role: + +```sh +# become user netdata +sudo su -s /bin/bash netdata + +# send a test alarm +/usr/libexec/netdata/plugins.d/alarm-notify.sh test ROLE +``` + +If everything works, you'll see alarms in your Opsgenie platform: + +![Example alarm notifications in +Opsgenie](https://user-images.githubusercontent.com/49162938/92184518-f725f900-ee40-11ea-9afa-e7c639c72206.png) + +If sending the test notifications fails, you can look in `/var/log/netdata/error.log` to find the relevant error +message: + +```log +2020-09-03 23:07:00: alarm-notify.sh: ERROR: failed to send opsgenie notification for: hades test.chart.test_alarm is CRITICAL, with HTTP error code 401. +``` + +You can find more details about the Opsgenie error codes in their [response +docs](https://docs.opsgenie.com/docs/response). + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fopsgenie%2FREADME%2FDonations-netdata-has-received&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/notifications/pagerduty/README.md b/health/notifications/pagerduty/README.md index 59a48515c..b1f60d495 100644 --- a/health/notifications/pagerduty/README.md +++ b/health/notifications/pagerduty/README.md @@ -1,3 +1,8 @@ + + # PagerDuty [PagerDuty](https://www.pagerduty.com/company/) is the enterprise incident resolution service that integrates with ITOps and DevOps monitoring stacks to improve operational reliability and agility. From enriching and aggregating events to correlating them into incidents, PagerDuty streamlines the incident management process by reducing alert noise and resolution times. @@ -32,6 +37,10 @@ SEND_PD="YES" # the "General API" pagerduty.com service that uses this service key. # (empty = do not send a notification for unconfigured roles): DEFAULT_RECIPIENT_PD="" + +# Which PD API are we going to use? For version 2 or newer, it is necessary to do a request for Pagerduty +# before to set the version(https://developer.pagerduty.com/docs/events-api-v2/overview/). +USE_PD_VERSION="1" ``` [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fpagerduty%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/notifications/prowl/Makefile.inc b/health/notifications/prowl/Makefile.inc new file mode 100644 index 000000000..64a1deb65 --- /dev/null +++ b/health/notifications/prowl/Makefile.inc @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_noinst_DATA += \ + prowl/README.md \ + prowl/Makefile.inc \ + $(NULL) + diff --git a/health/notifications/prowl/README.md b/health/notifications/prowl/README.md new file mode 100644 index 000000000..7c60de270 --- /dev/null +++ b/health/notifications/prowl/README.md @@ -0,0 +1,29 @@ + + +# Prowl + +[Prowl](https://www.prowlapp.com/) is a push notification service for iOS devices. Netdata +supports delivering notifications to iOS devices through Prowl. + +Because of how Netdata integrates with Prowl, there is a hard limit of +at most 1000 notifications per hour (starting from the first notification +sent). Any alerts beyond the first thousand in an hour will be dropped. + +Warning messages will be sent with the 'High' priority, critical messages +will be sent with the 'Emergency' priority, and all other messages will +be sent with the normal priority. Opening the notification's associated +URL will take you to the Netdata dashboard of the system that issued +the alert, directly to the chart that it triggered on. + +## configuration + +To use this, you will need a Prowl API key, which can be requested through +the Prowl website after registering. + +Once you have an API key, simply specify that as a recipient for Prowl +notifications. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fprowl%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/health/notifications/pushbullet/README.md b/health/notifications/pushbullet/README.md index f5673eca9..7a098d6a0 100644 --- a/health/notifications/pushbullet/README.md +++ b/health/notifications/pushbullet/README.md @@ -1,3 +1,8 @@ + + # PushBullet Will look like this on your browser: diff --git a/health/notifications/pushover/README.md b/health/notifications/pushover/README.md index 2d488d1a8..3ba97fbc3 100644 --- a/health/notifications/pushover/README.md +++ b/health/notifications/pushover/README.md @@ -1,3 +1,8 @@ + + # PushOver pushover.net allows you to receive push notifications on your mobile phone. The service seems free for up to 7.500 messages per month. diff --git a/health/notifications/rocketchat/README.md b/health/notifications/rocketchat/README.md index 47ac5e3f1..a54f5826c 100644 --- a/health/notifications/rocketchat/README.md +++ b/health/notifications/rocketchat/README.md @@ -1,3 +1,8 @@ + + # Rocket.Chat This is what you will get: @@ -42,6 +47,6 @@ role_recipients_rocketchat[webmaster]="marketing development" ``` The keywords `systems`, `databases`, `marketing`, `development` are RocketChat channels (they should already exist). -Both public and private channels can be used, even if they differ from the channel configured in yout RocketChat incomming webhook. +Both public and private channels can be used, even if they differ from the channel configured in your RocketChat incoming webhook. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Frocketchat%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/notifications/slack/README.md b/health/notifications/slack/README.md index 2352d27c8..e338e9af0 100644 --- a/health/notifications/slack/README.md +++ b/health/notifications/slack/README.md @@ -1,3 +1,8 @@ + + # Slack This is what you will get: diff --git a/health/notifications/smstools3/README.md b/health/notifications/smstools3/README.md index 28184b3c4..6d90e702a 100644 --- a/health/notifications/smstools3/README.md +++ b/health/notifications/smstools3/README.md @@ -1,3 +1,8 @@ + + # SMS Server Tools 3 The [SMS Server Tools 3](http://smstools3.kekekasvi.com/) is a SMS Gateway software which can send and receive short messages through GSM modems and mobile phones. diff --git a/health/notifications/stackpulse/Makefile.inc b/health/notifications/stackpulse/Makefile.inc new file mode 100644 index 000000000..eabcb4bcf --- /dev/null +++ b/health/notifications/stackpulse/Makefile.inc @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_noinst_DATA += \ + stackpulse/README.md \ + stackpulse/Makefile.inc \ + $(NULL) + diff --git a/health/notifications/stackpulse/README.md b/health/notifications/stackpulse/README.md new file mode 100644 index 000000000..13d2f7235 --- /dev/null +++ b/health/notifications/stackpulse/README.md @@ -0,0 +1,80 @@ + + +# Send notifications to StackPulse + +[StackPulse](https://stackpulse.com/) is a software-as-a-service platform for site reliability engineering. +It helps SREs, DevOps Engineers and Software Developers reduce toil and alert fatigue while improving reliability of +software services by managing, analyzing and automating incident response activities. + +Sending Netdata alarm notifications to StackPulse allows you to create smart automated response workflows +(StackPulse playbooks) that will help you drive down your MTTD and MTTR by performing any of the following: + +- Enriching the incident with data from multiple sources +- Performing triage actions and analyzing their results +- Orchestrating incident management and notification flows +- Performing automatic and semi-automatic remediation actions +- Analyzing incident data and remediation patterns to improve reliability of your services + +To send the notification you need: + +1. Create a Netdata integration in the `StackPulse Administration Portal`, and copy the `Endpoint` URL. + +![Creating a Netdata integration in StackPulse](https://user-images.githubusercontent.com/49162938/93023348-d9455a80-f5dd-11ea-8e05-67d07dce93e4.png) + +2. On your node, navigate to `/etc/netdata/` and run the following command: + +```sh +$ ./edit-config health_alarm_notify.conf +``` + +3. Set the `STACKPULSE_WEBHOOK` variable to `Endpoint` URL you copied earlier: + +``` +SEND_STACKPULSE="YES" +STACKPULSE_WEBHOOK="https://hooks.stackpulse.io/v1/webhooks/YOUR_UNIQUE_ID" +``` + +4. Now [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). When your node creates an alarm, you + can see the associated notification on your StackPulse Administration Portal + +## React to alarms with playbooks + +StackPulse allow users to create `Playbooks` giving additional information about events that happen in specific +scenarios. For example, you could create a Playbook that responds to a "low disk space" alarm by compressing and +cleaning up storage partitions with dynamic data. + +![image](https://user-images.githubusercontent.com/49162938/93207961-4c201400-f74b-11ea-94d1-42a29d007b62.png) + +![The StackPulse Administration Portal with a Netdata +alarm](https://user-images.githubusercontent.com/49162938/93208199-bfc22100-f74b-11ea-83c4-728be23dcf4d.png) +### Create Playbooks for Netdata alarms + +To create a Playbook, you need to access the StackPulse Administration Portal. After the initial setup, you need to +access the **TRIGGER** tab to define the scenarios used to trigger the event. The following variables are available: + +- `Hostname`: The host that generated the event. +- `Chart`: The name of the chart. +- `OldValue` : The previous value of the alarm. +- `Value`: The current value of the alarm. +- `Units` : The units of the value. +- `OldStatus` : The previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL. +- `State`: The current alarm status, the acceptable values are the same of `OldStatus`. +- `Alarm` : The name of the alarm, as given in Netdata's health.d entries. +- `Date` : The timestamp this event occurred. +- `Duration` : The duration in seconds of the previous alarm state. +- `NonClearDuration` : The total duration in seconds this is/was non-clear. +- `Description` : A short description of the alarm copied from the alarm definition. +- `CalcExpression` : The expression that was evaluated to trigger the alarm. +- `CalcParamValues` : The values of the parameters in the expression, at the time of the evaluation. +- `TotalWarnings` : Total number of alarms in WARNING state. +- `TotalCritical` : Total number of alarms in CRITICAL state. +- `ID` : The unique id of the alarm that generated this event. + +For more details how to create a scenario, take a look at the [StackPulse documentation](https://docs.stackpulse.io). + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fopsgenie%2FREADME%2FDonations-netdata-has-received&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/health/notifications/syslog/README.md b/health/notifications/syslog/README.md index 5f1d5d8be..456394d2f 100644 --- a/health/notifications/syslog/README.md +++ b/health/notifications/syslog/README.md @@ -1,3 +1,8 @@ + + # Syslog You need a working `logger` command for this to work. This is the case on pretty much every Linux system in existence, and most BSD systems. diff --git a/health/notifications/telegram/README.md b/health/notifications/telegram/README.md index 45a2beaa8..c1c6f2a4e 100644 --- a/health/notifications/telegram/README.md +++ b/health/notifications/telegram/README.md @@ -1,3 +1,8 @@ + + # Telegram [Telegram](https://telegram.org/) is a messaging app with a focus on speed and security, it’s super-fast, simple and free. You can use Telegram on all your devices at the same time — your messages sync seamlessly across any number of your phones, tablets or computers. @@ -12,7 +17,7 @@ You need to: 2. Start a conversation with your bot or invite it into a group where you want it to send messages. 3. Find the chat ID for every chat you want to send messages to. Contact the [@myidbot](https://t.me/myidbot) bot and send the `/getid` command to get your personal chat ID or invite it into a group and use the `/getgroupid` command to get the group chat ID. Group IDs start with a hyphen, supergroup IDs start with `-100`. Alternatively, you can get the chat ID directly from the bot API. Send *your* bot a command in the chat you want to use, then check `https://api.telegram.org/bot{YourBotToken}/getUpdates`, eg. `https://api.telegram.org/bot111122223:7OpFlFFRzRBbrUUmIjj5HF9Ox2pYJZy5/getUpdates` - +4. Set the bot token and the chat ID of the recipient in `/etc/netdata/health_alarm_notify.conf` (to edit it on your system run `/etc/netdata/edit-config health_alarm_notify.conf`), like this: ``` SEND_TELEGRAM="YES" TELEGRAM_BOT_TOKEN="111122223:7OpFlFFRzRBbrUUmIjj5HF9Ox2pYJZy5" diff --git a/health/notifications/twilio/README.md b/health/notifications/twilio/README.md index 8c8d7cfb8..b36d40b99 100644 --- a/health/notifications/twilio/README.md +++ b/health/notifications/twilio/README.md @@ -1,3 +1,8 @@ + + # Twilio Will look like this on your Android device: diff --git a/health/notifications/web/README.md b/health/notifications/web/README.md index 8eed06bfb..9e4918603 100644 --- a/health/notifications/web/README.md +++ b/health/notifications/web/README.md @@ -1,3 +1,8 @@ + + # Dashboard The Netdata dashboard shows HTML notifications, when it is open. -- cgit v1.2.3