summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2022-06-09 04:52:39 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2022-06-09 04:52:39 +0000
commit89f3604407aff8f4cb2ed958252c61e23c767e24 (patch)
tree7fbf408102cab051557d38193524d8c6e991d070 /health
parentAdding upstream version 1.34.1. (diff)
downloadnetdata-upstream/1.35.0.tar.xz
netdata-upstream/1.35.0.zip
Adding upstream version 1.35.0.upstream/1.35.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health')
-rw-r--r--health/QUICKSTART.md4
-rw-r--r--health/health.c126
-rw-r--r--health/health.d/ram.conf18
-rw-r--r--health/health_config.c7
-rw-r--r--health/health_json.c11
-rw-r--r--health/health_log.c6
-rw-r--r--health/notifications/Makefile.am1
-rwxr-xr-xhealth/notifications/alarm-notify.sh.in69
-rw-r--r--health/notifications/gotify/Makefile.inc11
-rw-r--r--health/notifications/gotify/README.md62
-rwxr-xr-xhealth/notifications/health_alarm_notify.conf22
11 files changed, 264 insertions, 73 deletions
diff --git a/health/QUICKSTART.md b/health/QUICKSTART.md
index 5cf6929dc..bc2da2df1 100644
--- a/health/QUICKSTART.md
+++ b/health/QUICKSTART.md
@@ -41,9 +41,9 @@ address or hostname for your Agent dashboard, looking for the `stock health conf
here will show the correct path for your installation.
```conf
-[health]
+[directories]
...
- # stock health configuration directory = /usr/lib/netdata/conf.d/health.d
+ # stock health config = /usr/lib/netdata/conf.d/health.d
```
Navigate to the health configuration directory to see all the available files and open them for reading.
diff --git a/health/health.c b/health/health.c
index 528238d74..3c1e5693e 100644
--- a/health/health.c
+++ b/health/health.c
@@ -58,7 +58,7 @@ static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
inline char *health_user_config_dir(void) {
char buffer[FILENAME_MAX + 1];
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
- return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
+ return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
}
/**
@@ -71,7 +71,7 @@ inline char *health_user_config_dir(void) {
inline char *health_stock_config_dir(void) {
char buffer[FILENAME_MAX + 1];
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
- return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer);
+ return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
}
/**
@@ -354,7 +354,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s'",
+ snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
exec,
recipient,
host->registry_hostname,
@@ -383,7 +383,8 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
buffer_tostring(warn_alarms),
buffer_tostring(crit_alarms),
ae->classification?ae->classification:"Unknown",
- edit_command
+ edit_command,
+ host != localhost ? host->machine_guid:""
);
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
@@ -453,9 +454,11 @@ static inline void health_alarm_log_process(RRDHOST *host) {
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
+ bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
+
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
- if(host->health_log.count <= host->health_log.max)
+ if (!cleanup_excess_log_entries)
return;
// cleanup excess entries in the log
@@ -514,11 +517,6 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
return 0;
}
- if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
-
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
return 0;
@@ -576,6 +574,8 @@ static inline int check_if_resumed_from_suspension(void) {
}
static void health_main_cleanup(void *ptr) {
+ worker_unregister();
+
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@@ -658,35 +658,34 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
// Create alarms for dimensions that have been added to charts
// since the previous iteration.
static void init_pending_foreach_alarms(RRDHOST *host) {
- rrdhost_wrlock(host);
+ RRDSET *st;
+ RRDDIM *rd;
- if (host->alarms_with_foreach || host->alarms_template_with_foreach) {
- if (rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) {
- RRDSET *st;
+ if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
+ return;
- rrdset_foreach_read(st, host) {
- rrdset_wrlock(st);
+ rrdhost_wrlock(host);
- if (rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) {
- RRDDIM *rd;
+ rrdset_foreach_write(st, host) {
+ if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
+ continue;
- rrddim_foreach_write(rd, st) {
- if (rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) {
- rrdcalc_link_to_rrddim(rd, st, host);
- rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
- }
- }
+ rrdset_rdlock(st);
- rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
- }
+ rrddim_foreach_read(rd, st) {
+ if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
+ continue;
- rrdset_unlock(st);
- }
+ rrdcalc_link_to_rrddim(rd, st, host);
- rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
+ rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
}
+
+ rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
+ rrdset_unlock(st);
}
+ rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
rrdhost_unlock(host);
}
@@ -699,7 +698,31 @@ static void init_pending_foreach_alarms(RRDHOST *host) {
*
* @return It always returns NULL
*/
+
+#define WORKER_HEALTH_JOB_RRD_LOCK 0
+#define WORKER_HEALTH_JOB_HOST_LOCK 1
+#define WORKER_HEALTH_JOB_DB_QUERY 2
+#define WORKER_HEALTH_JOB_CALC_EVAL 3
+#define WORKER_HEALTH_JOB_WARNING_EVAL 4
+#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
+#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
+#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
+#endif
+
void *health_main(void *ptr) {
+ worker_register("HEALTH");
+ worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
+ worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
+ worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
+ worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
+ worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
+
netdata_thread_cleanup_push(health_main_cleanup, ptr);
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
@@ -747,6 +770,7 @@ void *health_main(void *ptr) {
marked_aclk_reload_loop = loop;
#endif
+ worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
rrd_rdlock();
RRDHOST *host;
@@ -776,6 +800,7 @@ void *health_main(void *ptr) {
init_pending_foreach_alarms(host);
+ worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
rrdhost_rdlock(host);
// the first loop is to lookup values from the db
@@ -790,6 +815,7 @@ void *health_main(void *ptr) {
rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
if (!rrdcalc_isrepeating(rc)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
time_t now = now_realtime_sec();
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
@@ -804,11 +830,10 @@ void *health_main(void *ptr) {
rc->value = NAN;
#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
- sql_queue_removed_alerts_to_aclk(host);
+ sql_queue_alarm_to_aclk(host, ae, 1);
#endif
}
}
- continue;
}
if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
@@ -825,6 +850,8 @@ void *health_main(void *ptr) {
// if there is database lookup, do it
if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
+ worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
+
/* time_t old_db_timestamp = rc->db_before; */
int value_is_null = 0;
@@ -881,6 +908,8 @@ void *health_main(void *ptr) {
// if there is calculation expression, run it
if (unlikely(rc->calculation)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
+
if (unlikely(!expression_evaluate(rc->calculation))) {
// calculation failed
rc->value = NAN;
@@ -929,6 +958,8 @@ void *health_main(void *ptr) {
// check the warning expression
if (likely(rc->warning)) {
+ worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
+
if (unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
@@ -953,6 +984,8 @@ void *health_main(void *ptr) {
// check the critical expression
if (likely(rc->critical)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
+
if (unlikely(!expression_evaluate(rc->critical))) {
// calculation failed
rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
@@ -1010,6 +1043,7 @@ void *health_main(void *ptr) {
// check if the new status and the old differ
if (status != rc->status) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
int delay = 0;
// apply trigger hysteresis
@@ -1041,19 +1075,19 @@ void *health_main(void *ptr) {
rc->delay_last = delay;
rc->delay_up_to_timestamp = now + delay;
- if(likely(!rrdcalc_isrepeating(rc))) {
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
- );
- health_alarm_log(host, ae);
- }
+
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+ rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ )
+ );
+ health_alarm_log(host, ae);
+
rc->last_status_change = now;
rc->old_status = rc->status;
rc->status = status;
@@ -1091,7 +1125,9 @@ void *health_main(void *ptr) {
}
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
rc->last_repeat = now;
+ if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
@@ -1122,6 +1158,7 @@ void *health_main(void *ptr) {
// execute notifications
// and cleanup
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
health_alarm_log_process(host);
if (unlikely(netdata_exit)) {
@@ -1160,6 +1197,7 @@ void *health_main(void *ptr) {
now = now_realtime_sec();
if(now < next_run) {
+ worker_is_idle();
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
now = now_realtime_sec();
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 6e6e3b400..ff5f3ac17 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -1,18 +1,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
- alarm: used_ram_to_ignore
- on: system.ram
- class: Utilization
- type: System
-component: Memory
- os: linux freebsd
- hosts: *
- calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
- every: 10s
- info: amount of memory reported as used, \
- but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
-
alarm: ram_in_use
on: system.ram
class: Utilization
@@ -20,7 +8,7 @@ component: Memory
component: Memory
os: linux
hosts: *
- calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free + $buffers)
+ calc: $used * 100 / ($used + $cached + $free + $buffers)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
@@ -66,7 +54,7 @@ host labels: _is_k8s_node = false
component: Memory
os: freebsd
hosts: *
- calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+ calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
@@ -82,7 +70,7 @@ component: Memory
component: Memory
os: freebsd
hosts: *
- calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ calc: ($free + $inactive + $cache) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
diff --git a/health/health_config.c b/health/health_config.c
index e1f5f0e31..df6d7b609 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -109,7 +109,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
&& !strcmp(t->name, rt->name)
&& !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*")
)) {
- error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
+ info("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
return 0;
}
}
@@ -127,7 +127,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
&& !strcmp(t->name, rt->name)
&& !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*")
)) {
- error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
+ info("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
return 0;
}
}
@@ -433,6 +433,9 @@ static inline int health_parse_db_lookup(
else if(!strcasecmp(key, "unaligned")) {
*options |= RRDR_OPTION_NOT_ALIGNED;
}
+ else if(!strcasecmp(key, "anomaly-bit")) {
+ *options |= RRDR_OPTION_ANOMALY_BIT;
+ }
else if(!strcasecmp(key, "match-ids") || !strcasecmp(key, "match_ids")) {
*options |= RRDR_OPTION_MATCH_IDS;
}
diff --git a/health/health_json.c b/health/health_json.c
index be95100bc..d5285c11e 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -165,6 +165,10 @@ static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb,
buffer_rrd_value(wb, rc->value);
buffer_strcat(wb, ",\n");
+ buffer_strcat(wb, "\t\t\t\"last_updated\":");
+ buffer_sprintf(wb, "%lu", (unsigned long)rc->last_updated);
+ buffer_strcat(wb, ",\n");
+
buffer_sprintf(wb,
"\t\t\t\"status\": \"%s\"\n"
, rrdcalc_status2string(rc->status));
@@ -227,6 +231,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
"\t\t\t\"crit_repeat_every\": \"%u\",\n"
"\t\t\t\"value_string\": \"%s\",\n"
"\t\t\t\"last_repeat\": \"%lu\",\n"
+ "\t\t\t\"times_repeat\": %lu,\n"
, rc->chart, rc->name
, (unsigned long)rc->id
, hash_id
@@ -259,6 +264,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
, rc->crit_repeat_every
, value_string
, (unsigned long)rc->last_repeat
+ , (unsigned long)rc->times_repeat
);
if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
@@ -338,6 +344,8 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL
for(rc = host->alarms; rc ; rc = rc->next) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
+ if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
+ continue;
if(unlikely(rc->rrdset && rc->rrdset->hash_context == simple_hash(tok)
&& !strcmp(rc->rrdset->context, tok)
&& ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)))
@@ -349,7 +357,8 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL
for(rc = host->alarms; rc ; rc = rc->next) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
-
+ if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
+ continue;
if(unlikely((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))
numberOfAlarms++;
}
diff --git a/health/health_log.c b/health/health_log.c
index 6d63966c7..54f6dc9fc 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -162,7 +162,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
#ifdef ENABLE_ACLK
if (netdata_cloud_setting) {
- sql_queue_alarm_to_aclk(host, ae);
+ sql_queue_alarm_to_aclk(host, ae, 0);
}
#endif
}
@@ -560,10 +560,6 @@ inline void health_alarm_log(
) {
debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
- if(unlikely(alarm_entry_isrepeating(host, ae))) {
- error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id);
- return;
- }
// link it
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
ae->next = host->health_log.alarms;
diff --git a/health/notifications/Makefile.am b/health/notifications/Makefile.am
index 46a6e472c..f026171a7 100644
--- a/health/notifications/Makefile.am
+++ b/health/notifications/Makefile.am
@@ -31,6 +31,7 @@ include awssns/Makefile.inc
include discord/Makefile.inc
include email/Makefile.inc
include flock/Makefile.inc
+include gotify/Makefile.inc
include hangouts/Makefile.inc
include irc/Makefile.inc
include kavenegar/Makefile.inc
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 287cabfef..38a69a0f3 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -38,6 +38,7 @@
# - Dynatrace Event by @illumine
# - Stackpulse Event by @thiagoftsm
# - Opsgenie by @thiaoftsm #9858
+# - Gotify by @coffeegrind123
# -----------------------------------------------------------------------------
# testing notifications
@@ -243,6 +244,7 @@ else
total_crit_alarms="${26}" # List of alarms in critical state
classification="${27}" # The class field from .conf files
edit_command_line="${28}" # The command to edit the alarm, with the line number
+ child_machine_guid="${29}" # If populated, the notification is sent for a child
fi
# -----------------------------------------------------------------------------
@@ -400,6 +402,10 @@ SEND_DYNATRACE=
# stackpulse configs
STACKPULSE_WEBHOOK=
+# gotify configs
+GOTIFY_APP_URL=
+GOTIFY_APP_TOKEN=
+
# opsgenie configs
OPSGENIE_API_KEY=
@@ -589,6 +595,9 @@ filter_recipient_by_criticality() {
# check matrix
{ [ -z "${MATRIX_HOMESERVER}" ] || [ -z "${MATRIX_ACCESSTOKEN}" ]; } && SEND_MATRIX="NO"
+# check gotify
+{ [ -z "${GOTIFY_APP_TOKEN}" ] || [ -z "${GOTIFY_APP_URL}" ]; } && SEND_GOTIFY="NO"
+
# check stackpulse
[ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO"
@@ -626,7 +635,8 @@ if [ "${SEND_PUSHOVER}" = "YES" ] ||
[ "${SEND_MSTEAMS}" = "YES" ] ||
[ "${SEND_DYNATRACE}" = "YES" ] ||
[ "${SEND_STACKPULSE}" = "YES" ] ||
- [ "${SEND_OPSGENIE}" = "YES" ]; then
+ [ "${SEND_OPSGENIE}" = "YES" ] ||
+ [ "${SEND_GOTIFY}" = "YES" ]; then
# if we need curl, check for the curl command
if [ -z "${curl}" ]; then
curl="$(command -v curl 2>/dev/null)"
@@ -656,6 +666,7 @@ if [ "${SEND_PUSHOVER}" = "YES" ] ||
SEND_DYNATRACE="NO"
SEND_STACKPULSE="NO"
SEND_OPSGENIE="NO"
+ SEND_GOTIFY="NO"
fi
fi
@@ -795,7 +806,8 @@ for method in "${SEND_EMAIL}" \
"${SEND_MSTEAMS}" \
"${SEND_DYNATRACE}" \
"${SEND_STACKPULSE}" \
- "${SEND_OPSGENIE}" ; do
+ "${SEND_OPSGENIE}" \
+ "${SEND_GOTIFY}" ; do
if [ "${method}" == "YES" ]; then
proceed=1
@@ -2278,6 +2290,45 @@ EOF
}
# -----------------------------------------------------------------------------
+# Gotify sender
+
+send_gotify() {
+ local payload httpcode priority
+ [ "${SEND_GOTIFY}" != "YES" ] && return 1
+
+ if [ -z "${GOTIFY_APP_TOKEN}" ] ; then
+ info "Can't send Gotify notification, because GOTIFY_APP_TOKEN is not defined"
+ return 1
+ fi
+
+ # priority for Gotify Android app
+ case "${status}" in
+ CRITICAL) priority=10 ;; # sound + vibration
+ WARNING) priority=4 ;; # sound
+ *) priority=1 ;; # notification only
+ esac
+
+ payload=$(cat <<EOF
+ {
+ "title" : "${status}, ${name} = ${value_string}, on ${host}",
+ "message" : "${date}: ${chart} ${value_string}",
+ "priority" : ${priority}
+ }
+EOF
+)
+
+ httpcode=$(docurl -X POST -H "Content-Type: application/json" -d "${payload}" "${GOTIFY_APP_URL}/message?token=${GOTIFY_APP_TOKEN}")
+ if [ "${httpcode}" = "200" ]; then
+ info "sent gotify notification for: ${host} ${chart}.${name} is ${status}"
+ else
+ error "failed to send gotify notification for: ${host} ${chart}.${name} is ${status}, with HTTP error code ${httpcode}."
+ return 1
+ fi
+
+ return 0
+}
+
+# -----------------------------------------------------------------------------
# prepare the content of the notification
# the url to send the user on click
@@ -2311,7 +2362,11 @@ if [ ${GOTOCLOUD} -eq 0 ]; then
else
# Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud
# Re-allow alarm redirection, for alarms 2.0, new template
- goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
+ if [ -z "${child_machine_guid}" ]; then
+ goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
+ else
+ goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&childId=${child_machine_guid}&${redirect_params}"
+ fi
fi
# the severity of the alarm
@@ -3467,6 +3522,11 @@ send_opsgenie
SENT_OPSGENIE=$?
# -----------------------------------------------------------------------------
+# send messages to Gotify
+send_gotify
+SENT_GOTIFY=$?
+
+# -----------------------------------------------------------------------------
# let netdata know
for state in "${SENT_EMAIL}" \
"${SENT_PUSHOVER}" \
@@ -3495,7 +3555,8 @@ for state in "${SENT_EMAIL}" \
"${SENT_MSTEAMS}" \
"${SENT_DYNATRACE}" \
"${SENT_STACKPULSE}" \
- "${SENT_OPSGENIE}"; do
+ "${SENT_OPSGENIE}" \
+ "${SENT_GOTIFY}"; do
if [ "${state}" -eq 0 ]; then
# we sent something
exit 0
diff --git a/health/notifications/gotify/Makefile.inc b/health/notifications/gotify/Makefile.inc
new file mode 100644
index 000000000..782559125
--- /dev/null
+++ b/health/notifications/gotify/Makefile.inc
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files
+dist_noinst_DATA += \
+ gotify/README.md \
+ gotify/Makefile.inc \
+ $(NULL)
diff --git a/health/notifications/gotify/README.md b/health/notifications/gotify/README.md
new file mode 100644
index 000000000..c253c845c
--- /dev/null
+++ b/health/notifications/gotify/README.md
@@ -0,0 +1,62 @@
+<!--
+title: "Send notifications to Gotify"
+description: "Send alerts to your Gotify instance when an alert gets triggered in Netdata."
+sidebar_label: "Gotify"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/health/notifications/gotify/README.md
+-->
+
+# Send notifications to Gotify
+
+[Gotify](https://gotify.net/) is a self-hosted push notification service created for sending and receiving messages in real time.
+
+## Configuring Gotify
+
+### Prerequisites
+
+To use Gotify as your notification service, you need an application token.
+You can generate a new token in the Gotify Web UI.
+
+### Configuration
+
+To set up Gotify in Netdata:
+
+1. Switch to your [config
+directory](/docs/configure/nodes.md) and edit the file `health_alarm_notify.conf` using the edit config script.
+
+ ```bash
+ ./edit-config health_alarm_notify.conf
+ ```
+
+2. Change the variable `GOTIFY_APP_TOKEN` to the application token you generated in the Gotify Web UI. Change
+`GOTIFY_APP_URL` to point to your Gotify instance.
+
+ ```conf
+ SEND_GOTIFY="YES"
+
+ # Application token
+ # Gotify instance url
+ GOTIFY_APP_TOKEN=XXXXXXXXXXXXXXX
+ GOTIFY_APP_URL=https://push.example.de
+ ```
+
+ Changes to `health_alarm_notify.conf` do not require a Netdata restart.
+
+3. Test your Gotify notifications configuration by running the following commands, replacing `ROLE` with your preferred role:
+
+ ```sh
+ # become user netdata
+ sudo su -s /bin/bash netdata
+
+ # send a test alarm
+ /usr/libexec/netdata/plugins.d/alarm-notify.sh test ROLE
+ ```
+
+ 🟢 If everything works, you'll see alarms in Gotify:
+
+ ![Example alarm notifications in Gotify](https://user-images.githubusercontent.com/103264516/162509205-1e88e5d9-96b6-4f7f-9426-182776158128.png)
+
+ 🔴 If sending the test notifications fails, check `/var/log/netdata/error.log` to find the relevant error message:
+
+ ```log
+ 2020-09-03 23:07:00: alarm-notify.sh: ERROR: failed to send gotify notification for: hades test.chart.test_alarm is CRITICAL, with HTTP error code 401.
+ ```
diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf
index 873c7c353..b69c6d538 100755
--- a/health/notifications/health_alarm_notify.conf
+++ b/health/notifications/health_alarm_notify.conf
@@ -279,6 +279,16 @@ STACKPULSE_WEBHOOK=""
DEFAULT_RECIPIENT_STACKPULSE=""
#------------------------------------------------------------------------------
+# gotify global notification options
+SEND_GOTIFY="YES"
+
+# App token and url
+GOTIFY_APP_TOKEN=""
+GOTIFY_APP_URL=""
+
+DEFAULT_RECIPIENT_GOTIFY=""
+
+#------------------------------------------------------------------------------
# opsgenie global notification options
SEND_OPSGENIE="YES"
@@ -971,6 +981,8 @@ role_recipients_matrix[sysadmin]="${DEFAULT_RECIPIENT_MATRIX}"
role_recipients_stackpulse[sysadmin]="${DEFAULT_RECIPIENT_STACKPULSE}"
+role_recipients_gotify[sysadmin]="${DEFAULT_RECIPIENT_GOTIFY}"
+
# -----------------------------------------------------------------------------
# DNS related alarms
@@ -1028,6 +1040,8 @@ role_recipients_matrix[domainadmin]="${DEFAULT_RECIPIENT_MATRIX}"
role_recipients_stackpulse[domainadmin]="${DEFAULT_RECIPIENT_STACKPULSE}"
+role_recipients_gotify[domainadmin]="${DEFAULT_RECIPIENT_GOTIFY}"
+
# -----------------------------------------------------------------------------
# database servers alarms
# mysql, redis, memcached, postgres, etc
@@ -1086,6 +1100,8 @@ role_recipients_matrix[dba]="${DEFAULT_RECIPIENT_MATRIX}"
role_recipients_stackpulse[dba]="${DEFAULT_RECIPIENT_STACKPULSE}"
+role_recipients_gotify[dba]="${DEFAULT_RECIPIENT_GOTIFY}"
+
# -----------------------------------------------------------------------------
# web servers alarms
# apache, nginx, lighttpd, etc
@@ -1144,6 +1160,8 @@ role_recipients_matrix[webmaster]="${DEFAULT_RECIPIENT_MATRIX}"
role_recipients_stackpulse[webmaster]="${DEFAULT_RECIPIENT_STACKPULSE}"
+role_recipients_gotify[webmaster]="${DEFAULT_RECIPIENT_GOTIFY}"
+
# -----------------------------------------------------------------------------
# proxy servers alarms
# squid, etc
@@ -1202,6 +1220,8 @@ role_recipients_matrix[proxyadmin]="${DEFAULT_RECIPIENT_MATRIX}"
role_recipients_stackpulse[proxyadmin]="${DEFAULT_RECIPIENT_STACKPULSE}"
+role_recipients_gotify[proxyadmin]="${DEFAULT_RECIPIENT_GOTIFY}"
+
# -----------------------------------------------------------------------------
# peripheral devices
# UPS, photovoltaics, etc
@@ -1257,3 +1277,5 @@ role_recipients_opsgenie[sitemgr]="${DEFAULT_RECIPIENT_OPSGENIE}"
role_recipients_matrix[sitemgr]="${DEFAULT_RECIPIENT_MATRIX}"
role_recipients_stackpulse[sitemgr]="${DEFAULT_RECIPIENT_STACKPULSE}"
+
+role_recipients_gotify[sitemgr]="${DEFAULT_RECIPIENT_GOTIFY}"