diff options
Diffstat (limited to 'health')
-rw-r--r-- | health/QUICKSTART.md | 4 | ||||
-rw-r--r-- | health/health.c | 126 | ||||
-rw-r--r-- | health/health.d/ram.conf | 18 | ||||
-rw-r--r-- | health/health_config.c | 7 | ||||
-rw-r--r-- | health/health_json.c | 11 | ||||
-rw-r--r-- | health/health_log.c | 6 | ||||
-rw-r--r-- | health/notifications/Makefile.am | 1 | ||||
-rwxr-xr-x | health/notifications/alarm-notify.sh.in | 69 | ||||
-rw-r--r-- | health/notifications/gotify/Makefile.inc | 11 | ||||
-rw-r--r-- | health/notifications/gotify/README.md | 62 | ||||
-rwxr-xr-x | health/notifications/health_alarm_notify.conf | 22 |
11 files changed, 264 insertions, 73 deletions
diff --git a/health/QUICKSTART.md b/health/QUICKSTART.md index 5cf6929dc..bc2da2df1 100644 --- a/health/QUICKSTART.md +++ b/health/QUICKSTART.md @@ -41,9 +41,9 @@ address or hostname for your Agent dashboard, looking for the `stock health conf here will show the correct path for your installation. ```conf -[health] +[directories] ... - # stock health configuration directory = /usr/lib/netdata/conf.d/health.d + # stock health config = /usr/lib/netdata/conf.d/health.d ``` Navigate to the health configuration directory to see all the available files and open them for reading. diff --git a/health/health.c b/health/health.c index 528238d74..3c1e5693e 100644 --- a/health/health.c +++ b/health/health.c @@ -58,7 +58,7 @@ static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae) inline char *health_user_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir); - return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer); + return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer); } /** @@ -71,7 +71,7 @@ inline char *health_user_config_dir(void) { inline char *health_stock_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir); - return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer); + return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer); } /** @@ -354,7 +354,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { char *edit_command = ae->source ? 
health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN"); - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s'", + snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", exec, recipient, host->registry_hostname, @@ -383,7 +383,8 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { buffer_tostring(warn_alarms), buffer_tostring(crit_alarms), ae->classification?ae->classification:"Unknown", - edit_command + edit_command, + host != localhost ? host->machine_guid:"" ); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; @@ -453,9 +454,11 @@ static inline void health_alarm_log_process(RRDHOST *host) { // remember this for the next iteration host->health_last_processed_id = first_waiting; + bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max; + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - if(host->health_log.count <= host->health_log.max) + if (!cleanup_excess_log_entries) return; // cleanup excess entries in the log @@ -514,11 +517,6 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) return 0; } - if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name); - return 0; - } - if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { debug(D_HEALTH, "Health not running alarm '%s.%s'. 
The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name); return 0; @@ -576,6 +574,8 @@ static inline int check_if_resumed_from_suspension(void) { } static void health_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -658,35 +658,34 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { // Create alarms for dimensions that have been added to charts // since the previous iteration. static void init_pending_foreach_alarms(RRDHOST *host) { - rrdhost_wrlock(host); + RRDSET *st; + RRDDIM *rd; - if (host->alarms_with_foreach || host->alarms_template_with_foreach) { - if (rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) { - RRDSET *st; + if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) + return; - rrdset_foreach_read(st, host) { - rrdset_wrlock(st); + rrdhost_wrlock(host); - if (rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) { - RRDDIM *rd; + rrdset_foreach_write(st, host) { + if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) + continue; - rrddim_foreach_write(rd, st) { - if (rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) { - rrdcalc_link_to_rrddim(rd, st, host); - rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM); - } - } + rrdset_rdlock(st); - rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS); - } + rrddim_foreach_read(rd, st) { + if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) + continue; - rrdset_unlock(st); - } + rrdcalc_link_to_rrddim(rd, st, host); - rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS); + rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM); } + + rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS); + rrdset_unlock(st); } + rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS); rrdhost_unlock(host); } @@ -699,7 +698,31 @@ static void 
init_pending_foreach_alarms(RRDHOST *host) { * * @return It always returns NULL */ + +#define WORKER_HEALTH_JOB_RRD_LOCK 0 +#define WORKER_HEALTH_JOB_HOST_LOCK 1 +#define WORKER_HEALTH_JOB_DB_QUERY 2 +#define WORKER_HEALTH_JOB_CALC_EVAL 3 +#define WORKER_HEALTH_JOB_WARNING_EVAL 4 +#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5 +#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6 +#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8 +#endif + void *health_main(void *ptr) { + worker_register("HEALTH"); + worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock"); + worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock"); + worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup"); + worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval"); + worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval"); + worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval"); + worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry"); + worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process"); + netdata_thread_cleanup_push(health_main_cleanup, ptr); int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); @@ -747,6 +770,7 @@ void *health_main(void *ptr) { marked_aclk_reload_loop = loop; #endif + worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK); rrd_rdlock(); RRDHOST *host; @@ -776,6 +800,7 @@ void *health_main(void *ptr) { init_pending_foreach_alarms(host); + worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK); rrdhost_rdlock(host); // the first loop is to lookup values from the db @@ -790,6 +815,7 @@ void *health_main(void *ptr) { rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) && now > (rc->rrdset->last_collected_time.tv_sec + 60))) { if (!rrdcalc_isrepeating(rc)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); time_t now = now_realtime_sec(); 
ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, @@ -804,11 +830,10 @@ void *health_main(void *ptr) { rc->value = NAN; #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) if (netdata_cloud_setting && likely(!aclk_alert_reloaded)) - sql_queue_removed_alerts_to_aclk(host); + sql_queue_alarm_to_aclk(host, ae, 1); #endif } } - continue; } if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { @@ -825,6 +850,8 @@ void *health_main(void *ptr) { // if there is database lookup, do it if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY); + /* time_t old_db_timestamp = rc->db_before; */ int value_is_null = 0; @@ -881,6 +908,8 @@ void *health_main(void *ptr) { // if there is calculation expression, run it if (unlikely(rc->calculation)) { + worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL); + if (unlikely(!expression_evaluate(rc->calculation))) { // calculation failed rc->value = NAN; @@ -929,6 +958,8 @@ void *health_main(void *ptr) { // check the warning expression if (likely(rc->warning)) { + worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL); + if (unlikely(!expression_evaluate(rc->warning))) { // calculation failed rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; @@ -953,6 +984,8 @@ void *health_main(void *ptr) { // check the critical expression if (likely(rc->critical)) { + worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL); + if (unlikely(!expression_evaluate(rc->critical))) { // calculation failed rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; @@ -1010,6 +1043,7 @@ void *health_main(void *ptr) { // check if the new status and the old differ if (status != rc->status) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); int delay = 0; // apply trigger hysteresis @@ -1041,19 +1075,19 @@ void *health_main(void *ptr) { rc->delay_last = delay; rc->delay_up_to_timestamp = now + delay; - if(likely(!rrdcalc_isrepeating(rc))) { - ALARM_ENTRY *ae = 
health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - ); - health_alarm_log(host, ae); - } + + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? 
HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + health_alarm_log(host, ae); + rc->last_status_change = now; rc->old_status = rc->status; rc->status = status; @@ -1091,7 +1125,9 @@ void *health_main(void *ptr) { } if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); rc->last_repeat = now; + if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, @@ -1122,6 +1158,7 @@ void *health_main(void *ptr) { // execute notifications // and cleanup + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS); health_alarm_log_process(host); if (unlikely(netdata_exit)) { @@ -1160,6 +1197,7 @@ void *health_main(void *ptr) { now = now_realtime_sec(); if(now < next_run) { + worker_is_idle(); debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now)); now = now_realtime_sec(); diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 6e6e3b400..ff5f3ac17 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -1,18 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: used_ram_to_ignore - on: system.ram - class: Utilization - type: System -component: Memory - os: linux freebsd - hosts: * - calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) - every: 10s - info: amount of memory reported as used, \ - but it is actually capable for resizing itself based on the system needs (eg. 
ZFS ARC) - alarm: ram_in_use on: system.ram class: Utilization @@ -20,7 +8,7 @@ component: Memory component: Memory os: linux hosts: * - calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free + $buffers) + calc: $used * 100 / ($used + $cached + $free + $buffers) units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) @@ -66,7 +54,7 @@ host labels: _is_k8s_node = false component: Memory os: freebsd hosts: * - calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) + calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) @@ -82,7 +70,7 @@ component: Memory component: Memory os: freebsd hosts: * - calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) + calc: ($free + $inactive + $cache) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) units: % every: 10s warn: $this < (($status >= $WARNING) ? 
(15) : (10)) diff --git a/health/health_config.c b/health/health_config.c index e1f5f0e31..df6d7b609 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -109,7 +109,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL && !strcmp(t->name, rt->name) && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*") )) { - error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); + info("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); return 0; } } @@ -127,7 +127,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL && !strcmp(t->name, rt->name) && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*") )) { - error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); + info("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); return 0; } } @@ -433,6 +433,9 @@ static inline int health_parse_db_lookup( else if(!strcasecmp(key, "unaligned")) { *options |= RRDR_OPTION_NOT_ALIGNED; } + else if(!strcasecmp(key, "anomaly-bit")) { + *options |= RRDR_OPTION_ANOMALY_BIT; + } else if(!strcasecmp(key, "match-ids") || !strcasecmp(key, "match_ids")) { *options |= RRDR_OPTION_MATCH_IDS; } diff --git a/health/health_json.c b/health/health_json.c index be95100bc..d5285c11e 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -165,6 +165,10 @@ static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, buffer_rrd_value(wb, rc->value); buffer_strcat(wb, ",\n"); + buffer_strcat(wb, "\t\t\t\"last_updated\":"); + buffer_sprintf(wb, "%lu", (unsigned long)rc->last_updated); + buffer_strcat(wb, ",\n"); + buffer_sprintf(wb, "\t\t\t\"status\": \"%s\"\n" , rrdcalc_status2string(rc->status)); @@ -227,6 +231,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST 
*host, BUFFER *wb, RRDCALC "\t\t\t\"crit_repeat_every\": \"%u\",\n" "\t\t\t\"value_string\": \"%s\",\n" "\t\t\t\"last_repeat\": \"%lu\",\n" + "\t\t\t\"times_repeat\": %lu,\n" , rc->chart, rc->name , (unsigned long)rc->id , hash_id @@ -259,6 +264,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC , rc->crit_repeat_every , value_string , (unsigned long)rc->last_repeat + , (unsigned long)rc->times_repeat ); if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) { @@ -338,6 +344,8 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL for(rc = host->alarms; rc ; rc = rc->next) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; + if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) + continue; if(unlikely(rc->rrdset && rc->rrdset->hash_context == simple_hash(tok) && !strcmp(rc->rrdset->context, tok) && ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))) @@ -349,7 +357,8 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL for(rc = host->alarms; rc ; rc = rc->next) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; - + if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) + continue; if(unlikely((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)) numberOfAlarms++; } diff --git a/health/health_log.c b/health/health_log.c index 6d63966c7..54f6dc9fc 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -162,7 +162,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { #ifdef ENABLE_ACLK if (netdata_cloud_setting) { - sql_queue_alarm_to_aclk(host, ae); + sql_queue_alarm_to_aclk(host, ae, 0); } #endif } @@ -560,10 +560,6 @@ inline void health_alarm_log( ) { debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id); - 
if(unlikely(alarm_entry_isrepeating(host, ae))) { - error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id); - return; - } // link it netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); ae->next = host->health_log.alarms; diff --git a/health/notifications/Makefile.am b/health/notifications/Makefile.am index 46a6e472c..f026171a7 100644 --- a/health/notifications/Makefile.am +++ b/health/notifications/Makefile.am @@ -31,6 +31,7 @@ include awssns/Makefile.inc include discord/Makefile.inc include email/Makefile.inc include flock/Makefile.inc +include gotify/Makefile.inc include hangouts/Makefile.inc include irc/Makefile.inc include kavenegar/Makefile.inc diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index 287cabfef..38a69a0f3 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -38,6 +38,7 @@ # - Dynatrace Event by @illumine # - Stackpulse Event by @thiagoftsm # - Opsgenie by @thiaoftsm #9858 +# - Gotify by @coffeegrind123 # ----------------------------------------------------------------------------- # testing notifications @@ -243,6 +244,7 @@ else total_crit_alarms="${26}" # List of alarms in critical state classification="${27}" # The class field from .conf files edit_command_line="${28}" # The command to edit the alarm, with the line number + child_machine_guid="${29}" # If populated, the notification is sent for a child fi # ----------------------------------------------------------------------------- @@ -400,6 +402,10 @@ SEND_DYNATRACE= # stackpulse configs STACKPULSE_WEBHOOK= +# gotify configs +GOTIFY_APP_URL= +GOTIFY_APP_TOKEN= + # opsgenie configs OPSGENIE_API_KEY= @@ -589,6 +595,9 @@ filter_recipient_by_criticality() { # check matrix { [ -z "${MATRIX_HOMESERVER}" ] || [ -z "${MATRIX_ACCESSTOKEN}" ]; } && SEND_MATRIX="NO" +# check gotify +{ [ -z 
"${GOTIFY_APP_TOKEN}" ] || [ -z "${GOTIFY_APP_URL}" ]; } && SEND_GOTIFY="NO" + # check stackpulse [ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO" @@ -626,7 +635,8 @@ if [ "${SEND_PUSHOVER}" = "YES" ] || [ "${SEND_MSTEAMS}" = "YES" ] || [ "${SEND_DYNATRACE}" = "YES" ] || [ "${SEND_STACKPULSE}" = "YES" ] || - [ "${SEND_OPSGENIE}" = "YES" ]; then + [ "${SEND_OPSGENIE}" = "YES" ] || + [ "${SEND_GOTIFY}" = "YES" ]; then # if we need curl, check for the curl command if [ -z "${curl}" ]; then curl="$(command -v curl 2>/dev/null)" @@ -656,6 +666,7 @@ if [ "${SEND_PUSHOVER}" = "YES" ] || SEND_DYNATRACE="NO" SEND_STACKPULSE="NO" SEND_OPSGENIE="NO" + SEND_GOTIFY="NO" fi fi @@ -795,7 +806,8 @@ for method in "${SEND_EMAIL}" \ "${SEND_MSTEAMS}" \ "${SEND_DYNATRACE}" \ "${SEND_STACKPULSE}" \ - "${SEND_OPSGENIE}" ; do + "${SEND_OPSGENIE}" \ + "${SEND_GOTIFY}" ; do if [ "${method}" == "YES" ]; then proceed=1 @@ -2278,6 +2290,45 @@ EOF } # ----------------------------------------------------------------------------- +# Gotify sender + +send_gotify() { + local payload httpcode priority + [ "${SEND_GOTIFY}" != "YES" ] && return 1 + + if [ -z "${GOTIFY_APP_TOKEN}" ] ; then + info "Can't send Gotify notification, because GOTIFY_APP_TOKEN is not defined" + return 1 + fi + + # priority for Gotify Android app + case "${status}" in + CRITICAL) priority=10 ;; # sound + vibration + WARNING) priority=4 ;; # sound + *) priority=1 ;; # notification only + esac + + payload=$(cat <<EOF + { + "title" : "${status}, ${name} = ${value_string}, on ${host}", + "message" : "${date}: ${chart} ${value_string}", + "priority" : ${priority} + } +EOF +) + + httpcode=$(docurl -X POST -H "Content-Type: application/json" -d "${payload}" "${GOTIFY_APP_URL}/message?token=${GOTIFY_APP_TOKEN}") + if [ "${httpcode}" = "200" ]; then + info "sent gotify notification for: ${host} ${chart}.${name} is ${status}" + else + error "failed to send gotify notification for: ${host} ${chart}.${name} is ${status}, with HTTP 
error code ${httpcode}." + return 1 + fi + + return 0 +} + +# ----------------------------------------------------------------------------- # prepare the content of the notification # the url to send the user on click @@ -2311,7 +2362,11 @@ if [ ${GOTOCLOUD} -eq 0 ]; then else # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud # Re-allow alarm redirection, for alarms 2.0, new template - goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" + if [ -z "${child_machine_guid}" ]; then + goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" + else + goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&childId=${child_machine_guid}&${redirect_params}" + fi fi # the severity of the alarm @@ -3467,6 +3522,11 @@ send_opsgenie SENT_OPSGENIE=$? # ----------------------------------------------------------------------------- +# send messages to Gotify +send_gotify +SENT_GOTIFY=$? 
+ +# ----------------------------------------------------------------------------- # let netdata know for state in "${SENT_EMAIL}" \ "${SENT_PUSHOVER}" \ @@ -3495,7 +3555,8 @@ for state in "${SENT_EMAIL}" \ "${SENT_MSTEAMS}" \ "${SENT_DYNATRACE}" \ "${SENT_STACKPULSE}" \ - "${SENT_OPSGENIE}"; do + "${SENT_OPSGENIE}" \ + "${SENT_GOTIFY}"; do if [ "${state}" -eq 0 ]; then # we sent something exit 0 diff --git a/health/notifications/gotify/Makefile.inc b/health/notifications/gotify/Makefile.inc new file mode 100644 index 000000000..782559125 --- /dev/null +++ b/health/notifications/gotify/Makefile.inc @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_noinst_DATA += \ + gotify/README.md \ + gotify/Makefile.inc \ + $(NULL) diff --git a/health/notifications/gotify/README.md b/health/notifications/gotify/README.md new file mode 100644 index 000000000..c253c845c --- /dev/null +++ b/health/notifications/gotify/README.md @@ -0,0 +1,62 @@ +<!-- +title: "Send notifications to Gotify" +description: "Send alerts to your Gotify instance when an alert gets triggered in Netdata." +sidebar_label: "Gotify" +custom_edit_url: https://github.com/netdata/netdata/edit/master/health/notifications/gotify/README.md +--> + +# Send notifications to Gotify + +[Gotify](https://gotify.net/) is a self-hosted push notification service created for sending and receiving messages in real time. + +## Configuring Gotify + +### Prerequisites + +To use Gotify as your notification service, you need an application token. +You can generate a new token in the Gotify Web UI. + +### Configuration + +To set up Gotify in Netdata: + +1. Switch to your [config +directory](/docs/configure/nodes.md) and edit the file `health_alarm_notify.conf` using the edit config script. 
+ + ```bash + ./edit-config health_alarm_notify.conf + ``` + +2. Change the variable `GOTIFY_APP_TOKEN` to the application token you generated in the Gotify Web UI. Change +`GOTIFY_APP_URL` to point to your Gotify instance. + + ```conf + SEND_GOTIFY="YES" + + # Application token + # Gotify instance url + GOTIFY_APP_TOKEN=XXXXXXXXXXXXXXX + GOTIFY_APP_URL=https://push.example.de + ``` + + Changes to `health_alarm_notify.conf` do not require a Netdata restart. + +3. Test your Gotify notifications configuration by running the following commands, replacing `ROLE` with your preferred role: + + ```sh + # become user netdata + sudo su -s /bin/bash netdata + + # send a test alarm + /usr/libexec/netdata/plugins.d/alarm-notify.sh test ROLE + ``` + + 🟢 If everything works, you'll see alarms in Gotify: + + ![Example alarm notifications in Gotify](https://user-images.githubusercontent.com/103264516/162509205-1e88e5d9-96b6-4f7f-9426-182776158128.png) + + 🔴 If sending the test notifications fails, check `/var/log/netdata/error.log` to find the relevant error message: + + ```log + 2020-09-03 23:07:00: alarm-notify.sh: ERROR: failed to send gotify notification for: hades test.chart.test_alarm is CRITICAL, with HTTP error code 401. 
+ ``` diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf index 873c7c353..b69c6d538 100755 --- a/health/notifications/health_alarm_notify.conf +++ b/health/notifications/health_alarm_notify.conf @@ -279,6 +279,16 @@ STACKPULSE_WEBHOOK="" DEFAULT_RECIPIENT_STACKPULSE="" #------------------------------------------------------------------------------ +# gotify global notification options +SEND_GOTIFY="YES" + +# App token and url +GOTIFY_APP_TOKEN="" +GOTIFY_APP_URL="" + +DEFAULT_RECIPIENT_GOTIFY="" + +#------------------------------------------------------------------------------ # opsgenie global notification options SEND_OPSGENIE="YES" @@ -971,6 +981,8 @@ role_recipients_matrix[sysadmin]="${DEFAULT_RECIPIENT_MATRIX}" role_recipients_stackpulse[sysadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" +role_recipients_gotify[sysadmin]="${DEFAULT_RECIPIENT_GOTIFY}" + # ----------------------------------------------------------------------------- # DNS related alarms @@ -1028,6 +1040,8 @@ role_recipients_matrix[domainadmin]="${DEFAULT_RECIPIENT_MATRIX}" role_recipients_stackpulse[domainadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" +role_recipients_gotify[domainadmin]="${DEFAULT_RECIPIENT_GOTIFY}" + # ----------------------------------------------------------------------------- # database servers alarms # mysql, redis, memcached, postgres, etc @@ -1086,6 +1100,8 @@ role_recipients_matrix[dba]="${DEFAULT_RECIPIENT_MATRIX}" role_recipients_stackpulse[dba]="${DEFAULT_RECIPIENT_STACKPULSE}" +role_recipients_gotify[dba]="${DEFAULT_RECIPIENT_GOTIFY}" + # ----------------------------------------------------------------------------- # web servers alarms # apache, nginx, lighttpd, etc @@ -1144,6 +1160,8 @@ role_recipients_matrix[webmaster]="${DEFAULT_RECIPIENT_MATRIX}" role_recipients_stackpulse[webmaster]="${DEFAULT_RECIPIENT_STACKPULSE}" +role_recipients_gotify[webmaster]="${DEFAULT_RECIPIENT_GOTIFY}" + # 
----------------------------------------------------------------------------- # proxy servers alarms # squid, etc @@ -1202,6 +1220,8 @@ role_recipients_matrix[proxyadmin]="${DEFAULT_RECIPIENT_MATRIX}" role_recipients_stackpulse[proxyadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" +role_recipients_gotify[proxyadmin]="${DEFAULT_RECIPIENT_GOTIFY}" + # ----------------------------------------------------------------------------- # peripheral devices # UPS, photovoltaics, etc @@ -1257,3 +1277,5 @@ role_recipients_opsgenie[sitemgr]="${DEFAULT_RECIPIENT_OPSGENIE}" role_recipients_matrix[sitemgr]="${DEFAULT_RECIPIENT_MATRIX}" role_recipients_stackpulse[sitemgr]="${DEFAULT_RECIPIENT_STACKPULSE}" + +role_recipients_gotify[sitemgr]="${DEFAULT_RECIPIENT_GOTIFY}" |