summaryrefslogtreecommitdiffstats
path: root/src/health
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-08-26 08:15:24 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-08-26 08:15:35 +0000
commit: f09848204fa5283d21ea43e262ee41aa578e1808 (patch)
tree: c62385d7adf209fa6a798635954d887f718fb3fb /src/health
parent: Releasing debian version 1.46.3-2. (diff)
download: netdata-f09848204fa5283d21ea43e262ee41aa578e1808.tar.xz
netdata-f09848204fa5283d21ea43e262ee41aa578e1808.zip
Merging upstream version 1.47.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/health')
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_bad_content.md2
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_bad_status.md2
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_slow.md2
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_unreachable.md2
-rw-r--r--src/health/health.d/beanstalkd.conf30
-rw-r--r--src/health/health.d/docker.conf23
-rw-r--r--src/health/health.d/gearman.conf27
-rw-r--r--src/health/health.d/ipfs.conf4
-rw-r--r--src/health/health.d/x509check.conf7
-rw-r--r--src/health/health.d/zfs.conf19
-rw-r--r--src/health/health_event_loop.c195
-rw-r--r--src/health/health_log.c5
-rw-r--r--src/health/health_notifications.c11
-rw-r--r--src/health/health_prototypes.c9
-rwxr-xr-xsrc/health/notifications/alarm-notify.sh.in21
-rwxr-xr-xsrc/health/notifications/health_alarm_notify.conf1
-rw-r--r--src/health/notifications/telegram/README.md2
-rw-r--r--src/health/notifications/telegram/metadata.yaml2
18 files changed, 159 insertions, 205 deletions
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md b/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md
index 433425e09..cbf42694d 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md
@@ -27,4 +27,4 @@ sudo ./edit-config go.d/httpcheck.conf
### Useful resources
-1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md) \ No newline at end of file
+1. [HTTP endpoint monitoring with Netdata](/src/go/plugin/go.d/modules/httpcheck/integrations/http_endpoints.md) \ No newline at end of file
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md b/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md
index 60fabd751..8ac06a57e 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md
@@ -18,4 +18,4 @@ root@netdata # curl -v <your_http_endpoint>:<port>/<path>
### Useful resources
-1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md)
+1. [HTTP endpoint monitoring with Netdata](/src/go/plugin/go.d/modules/httpcheck/integrations/http_endpoints.md)
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_slow.md b/src/health/guides/httpcheck/httpcheck_web_service_slow.md
index 4f962e155..8f46a0f14 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_slow.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_slow.md
@@ -14,5 +14,5 @@ To troubleshoot this issue, check for:
### Useful resources
-1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md)
+1. [HTTP endpoint monitoring with Netdata](/src/go/plugin/go.d/modules/httpcheck/integrations/http_endpoints.md)
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md b/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md
index c77d33c0b..306ce1fee 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md
@@ -30,4 +30,4 @@ To troubleshoot this error, check the following:
### Useful resources
-1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md) \ No newline at end of file
+1. [HTTP endpoint monitoring with Netdata](/src/go/plugin/go.d/modules/httpcheck/integrations/http_endpoints.md) \ No newline at end of file
diff --git a/src/health/health.d/beanstalkd.conf b/src/health/health.d/beanstalkd.conf
index 0d37f28e0..51b280491 100644
--- a/src/health/health.d/beanstalkd.conf
+++ b/src/health/health.d/beanstalkd.conf
@@ -11,31 +11,5 @@ component: Beanstalk
warn: $this > 3
delay: up 0 down 5m multiplier 1.2 max 1h
summary: Beanstalk buried jobs
- info: Number of buried jobs across all tubes. \
- You need to manually kick them so they can be processed. \
- Presence of buried jobs in a tube does not affect new jobs.
- to: sysadmin
-
-# get the number of buried jobs per queue
-
-#template: beanstalk_tube_buried_jobs
-# on: beanstalk.jobs
-# calc: $buried
-# units: jobs
-# every: 10s
-# warn: $this > 0
-# crit: $this > 10
-# delay: up 0 down 5m multiplier 1.2 max 1h
-# info: the number of jobs buried per tube
-# to: sysadmin
-
-# get the current number of tubes
-
-#template: beanstalk_number_of_tubes
-# on: beanstalk.current_tubes
-# calc: $tubes
-# every: 10s
-# warn: $this < 5
-# delay: up 0 down 5m multiplier 1.2 max 1h
-# info: the current number of tubes on the server
-# to: sysadmin
+ info: Number of buried jobs across all tubes.
+ to: silent
diff --git a/src/health/health.d/docker.conf b/src/health/health.d/docker.conf
index 668614d4d..edb63a08c 100644
--- a/src/health/health.d/docker.conf
+++ b/src/health/health.d/docker.conf
@@ -1,4 +1,6 @@
- template: docker_container_unhealthy
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: docker_container_unhealthy
on: docker.container_health_status
class: Errors
type: Containers
@@ -10,3 +12,22 @@ component: Docker
summary: Docker container ${label:container_name} health
info: ${label:container_name} docker container health status is unhealthy
to: sysadmin
+
+# This alert monitors the status of Docker containers and triggers if any container is exited (down).
+# To enable this alert for specific containers, you need to modify the "chart labels" filter.
+# This filter uses Netdata's simple pattern matching syntax.
+
+ template: docker_container_down
+ on: docker.container_state
+ class: Errors
+ type: Containers
+ component: Docker
+chart labels: container_name=!*
+ units: status
+ every: 10s
+ lookup: average -10s of exited
+ warn: $this > 0
+ delay: down 1m multiplier 1.5 max 2h
+ summary: Docker container ${label:container_name} down
+ info: Docker container ${label:container_name} is currently not running
+ to: sysadmin
diff --git a/src/health/health.d/gearman.conf b/src/health/health.d/gearman.conf
index 78e1165d1..2b19105b5 100644
--- a/src/health/health.d/gearman.conf
+++ b/src/health/health.d/gearman.conf
@@ -1,14 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
- template: gearman_workers_queued
- on: gearman.single_job
- class: Latency
- type: Computing
-component: Gearman
- lookup: average -10m unaligned match-names of Pending
- units: workers
- every: 10s
- warn: $this > 30000
- delay: down 5m multiplier 1.5 max 1h
- summary: Gearman queued jobs
- info: Average number of queued jobs over the last 10 minutes
- to: sysadmin
+# template: gearman_function_waiting_jobs
+# on: gearman.function_queued_jobs_activity
+# class: Latency
+# type: Computing
+#component: Gearman
+# lookup: average -10m unaligned of waiting
+# units: jobs
+# every: 10s
+# warn: $this > 30000
+# delay: down 5m multiplier 1.5 max 1h
+# summary: Waiting jobs for ${label:function_name} function
+# info: Average number of waiting jobs for ${label:function_name} function over the last 10 minutes
+# to: sysadmin
diff --git a/src/health/health.d/ipfs.conf b/src/health/health.d/ipfs.conf
index 4dfee3c7f..bc3b0b1ea 100644
--- a/src/health/health.d/ipfs.conf
+++ b/src/health/health.d/ipfs.conf
@@ -1,10 +1,10 @@
template: ipfs_datastore_usage
- on: ipfs.repo_size
+ on: ipfs.datastore_space_utilization
class: Utilization
type: Data Sharing
component: IPFS
- calc: $size * 100 / $avail
+ calc: $used
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
diff --git a/src/health/health.d/x509check.conf b/src/health/health.d/x509check.conf
index 1d40c8602..38187326f 100644
--- a/src/health/health.d/x509check.conf
+++ b/src/health/health.d/x509check.conf
@@ -12,15 +12,16 @@ component: x509 certificates
summary: x509 certificate expiration for ${label:source}
info: Time until x509 certificate expires for ${label:source}
to: webmaster
-
+
template: x509check_revocation_status
on: x509check.revocation_status
class: Errors
type: Certificates
component: x509 certificates
calc: $revoked
+ units: status
every: 60s
- crit: $this != nan AND $this != 0
+ crit: $this == 1
summary: x509 certificate revocation status for ${label:source}
- info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source}
+ info: x509 certificate revocation status for ${label:source}
to: webmaster
diff --git a/src/health/health.d/zfs.conf b/src/health/health.d/zfs.conf
index 9c1f0018b..5c8065aa3 100644
--- a/src/health/health.d/zfs.conf
+++ b/src/health/health.d/zfs.conf
@@ -67,7 +67,7 @@ component: File system
type: System
component: File system
calc: $degraded
- units: boolean
+ units: status
every: 10s
warn: $this > 0
delay: down 1m multiplier 1.5 max 1h
@@ -81,10 +81,25 @@ component: File system
type: System
component: File system
calc: $faulted + $unavail
- units: boolean
+ units: status
every: 10s
crit: $this > 0
delay: down 1m multiplier 1.5 max 1h
summary: Critical ZFS pool ${label:pool} state
info: ZFS pool ${label:pool} state is faulted or unavail
to: sysadmin
+
+
+ template: zfs_vdev_health_state
+ on: zfspool.vdev_health_state
+ class: Errors
+ type: System
+component: File system
+ calc: $degraded + $faulted
+ units: status
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 1.5 max 1h
+ summary: ZFS vdev ${label:vdev} pool ${label:pool} state
+ info: ZFS vdev ${label:vdev} state is faulted or degraded
+ to: sysadmin
diff --git a/src/health/health_event_loop.c b/src/health/health_event_loop.c
index 756ffa165..b50812f2a 100644
--- a/src/health/health_event_loop.c
+++ b/src/health/health_event_loop.c
@@ -101,26 +101,10 @@ static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) {
}
}
-static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) {
-#ifdef ENABLE_ACLK
- if (netdata_cloud_enabled) {
- struct aclk_sync_cfg_t *wc = host->aclk_config;
- if (unlikely(!wc)) {
- return;
- }
-
- if (wc->alert_queue_removed >= 1) {
- wc->alert_queue_removed+=6;
- }
- }
-#endif
-}
-
static void health_execute_delayed_initializations(RRDHOST *host) {
health_plugin_init();
RRDSET *st;
- bool must_postpone = false;
if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
@@ -131,11 +115,8 @@ static void health_execute_delayed_initializations(RRDHOST *host) {
worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
health_prototype_alerts_for_rrdset_incrementally(st);
- must_postpone = true;
}
rrdset_foreach_done(st);
- if (must_postpone)
- sql_health_postpone_queue_removed(host);
}
static void health_initialize_rrdhost(RRDHOST *host) {
@@ -179,6 +160,50 @@ static inline int check_if_resumed_from_suspension(void) {
return ret;
}
+static void do_eval_expression(
+ RRDCALC *rc,
+ EVAL_EXPRESSION *expression,
+ const char *expression_type __maybe_unused,
+ size_t job_type,
+ RRDCALC_FLAGS error_type,
+ RRDCALC_STATUS *calc_status,
+ NETDATA_DOUBLE *result)
+{
+ if (!expression || (!calc_status && !result))
+ return;
+
+ worker_is_busy(job_type);
+
+ if (unlikely(!expression_evaluate(expression))) {
+ // expression evaluation failed
+ rc->run_flags |= error_type;
+ if (result)
+ *result = NAN;
+
+ netdata_log_debug(D_HEALTH,
+ "Health on host '%s', alarm '%s.%s': %s expression failed with error: %s",
+ rrdhost_hostname(rc->rrdset->rrdhost), rrdcalc_chart_name(rc), rrdcalc_name(rc), expression_type,
+ expression_error_msg(expression)
+ );
+ return;
+ }
+ rc->run_flags &= ~error_type;
+ netdata_log_debug(D_HEALTH,
+ "Health on host '%s', alarm '%s.%s': %s expression gave value "
+ NETDATA_DOUBLE_FORMAT ": %s (source: %s)",
+ rrdhost_hostname(rc->rrdset->rrdhost),
+ rrdcalc_chart_name(rc),
+ rrdcalc_name(rc),
+ expression_type,
+ expression_result(expression),
+ expression_error_msg(expression),
+ rrdcalc_source(rc));
+ if (calc_status)
+ *calc_status = rrdcalc_value2status(expression_result(expression));
+ else
+ *result = expression_result(expression);
+}
+
static void health_event_loop(void) {
bool health_running_logged = false;
@@ -270,6 +295,13 @@ static void health_event_loop(void) {
}
worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
+#ifdef ENABLE_ACLK
+ if (netdata_cloud_enabled) {
+ struct aclk_sync_cfg_t *wc = host->aclk_config;
+ if (wc && wc->send_snapshot == 2)
+ continue;
+ }
+#endif
// the first loop is to lookup values from the db
foreach_rrdcalc_in_rrdhost_read(host, rc) {
@@ -314,11 +346,6 @@ static void health_event_loop(void) {
rc->last_status_change_value = rc->value;
rc->last_updated = now_tmp;
rc->value = NAN;
-
-#ifdef ENABLE_ACLK
- if (netdata_cloud_enabled)
- sql_queue_alarm_to_aclk(host, ae, true);
-#endif
}
}
}
@@ -404,36 +431,7 @@ static void health_event_loop(void) {
// ------------------------------------------------------------
// if there is calculation expression, run it
- if (unlikely(rc->config.calculation)) {
- worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
-
- if (unlikely(!expression_evaluate(rc->config.calculation))) {
- // calculation failed
- rc->value = NAN;
- rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
-
- netdata_log_debug(
- D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
- rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
- expression_parsed_as(rc->config.calculation), expression_error_msg(rc->config.calculation)
- );
- }
- else {
- rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
-
- netdata_log_debug(
- D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
- NETDATA_DOUBLE_FORMAT": %s (source: %s)",
- rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
- expression_parsed_as(rc->config.calculation),
- expression_result(rc->config.calculation),
- expression_error_msg(rc->config.calculation),
- rrdcalc_source(rc)
- );
-
- rc->value = expression_result(rc->config.calculation);
- }
- }
+ do_eval_expression(rc, rc->config.calculation, "calculation", WORKER_HEALTH_JOB_CALC_EVAL, RRDCALC_FLAG_CALC_ERROR, NULL, &rc->value);
}
foreach_rrdcalc_in_rrdhost_done(rc);
@@ -453,65 +451,8 @@ static void health_event_loop(void) {
RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
- // --------------------------------------------------------
- // check the warning expression
-
- if (likely(rc->config.warning)) {
- worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
-
- if (unlikely(!expression_evaluate(rc->config.warning))) {
- // calculation failed
- rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
-
- netdata_log_debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
- rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
- expression_error_msg(rc->config.warning)
- );
- } else {
- rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
- netdata_log_debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': warning expression gave value "
- NETDATA_DOUBLE_FORMAT ": %s (source: %s)",
- rrdhost_hostname(host),
- rrdcalc_chart_name(rc),
- rrdcalc_name(rc),
- expression_result(rc->config.warning),
- expression_error_msg(rc->config.warning),
- rrdcalc_source(rc)
- );
- warning_status = rrdcalc_value2status(expression_result(rc->config.warning));
- }
- }
-
- // --------------------------------------------------------
- // check the critical expression
-
- if (likely(rc->config.critical)) {
- worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
-
- if (unlikely(!expression_evaluate(rc->config.critical))) {
- // calculation failed
- rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
-
- netdata_log_debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
- rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
- expression_error_msg(rc->config.critical)
- );
- } else {
- rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
- netdata_log_debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': critical expression gave value "
- NETDATA_DOUBLE_FORMAT ": %s (source: %s)",
- rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
- expression_result(rc->config.critical),
- expression_error_msg(rc->config.critical),
- rrdcalc_source(rc)
- );
- critical_status = rrdcalc_value2status(expression_result(rc->config.critical));
- }
- }
+ do_eval_expression(rc, rc->config.warning, "warning", WORKER_HEALTH_JOB_WARNING_EVAL, RRDCALC_FLAG_WARN_ERROR, &warning_status, NULL);
+ do_eval_expression(rc, rc->config.critical, "critical", WORKER_HEALTH_JOB_CRITICAL_EVAL, RRDCALC_FLAG_CRIT_ERROR, &critical_status, NULL);
// --------------------------------------------------------
// decide the final alarm status
@@ -706,26 +647,18 @@ static void health_event_loop(void) {
wait_for_all_notifications_to_finish_before_allowing_health_to_be_cleaned_up();
break;
}
+ }
#ifdef ENABLE_ACLK
- if (netdata_cloud_enabled) {
- struct aclk_sync_cfg_t *wc = host->aclk_config;
- if (unlikely(!wc))
- continue;
-
- if (wc->alert_queue_removed == 1) {
- sql_queue_removed_alerts_to_aclk(host);
- } else if (wc->alert_queue_removed > 1) {
- wc->alert_queue_removed--;
- }
-
- if (wc->alert_checkpoint_req == 1) {
- aclk_push_alarm_checkpoint(host);
- } else if (wc->alert_checkpoint_req > 1) {
- wc->alert_checkpoint_req--;
- }
- }
-#endif
+ struct aclk_sync_cfg_t *wc = host->aclk_config;
+ if (wc && wc->send_snapshot == 1) {
+ wc->send_snapshot = 2;
+ rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS);
}
+ else
+ if (process_alert_pending_queue(host))
+ rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS);
+#endif
+
dfe_done(host);
// wait for all notifications to finish before allowing health to be cleaned up
diff --git a/src/health/health_log.c b/src/health/health_log.c
index b04f8f248..143b741bf 100644
--- a/src/health/health_log.c
+++ b/src/health/health_log.c
@@ -4,7 +4,8 @@
// ----------------------------------------------------------------------------
-inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
+inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae)
+{
sql_health_alarm_log_save(host, ae);
}
@@ -43,7 +44,7 @@ void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int
};
ND_LOG_STACK_PUSH(lgs);
- errno = 0;
+ errno_clear();
ND_LOG_FIELD_PRIORITY priority = NDLP_INFO;
diff --git a/src/health/health_notifications.c b/src/health/health_notifications.c
index 79426f48c..85dd2d0d8 100644
--- a/src/health/health_notifications.c
+++ b/src/health/health_notifications.c
@@ -23,7 +23,13 @@ void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
return;
- spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
+ if(!ae->popen_instance) {
+ // nd_log(NDLS_DAEMON, NDLP_ERR, "attempted to wait for the execution of an alert that has not spawned a notification");
+ return;
+ }
+
+ ae->exec_code = spawn_popen_wait(ae->popen_instance);
+
netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
@@ -75,7 +81,6 @@ static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
alarm_notifications_in_progress.head = ae;
}
alarm_notifications_in_progress.tail = ae;
-
}
static bool prepare_command(BUFFER *wb,
@@ -462,7 +467,7 @@ void health_send_notification(RRDHOST *host, ALARM_ENTRY *ae, struct health_rais
netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run);
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
- ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
+ ae->popen_instance = spawn_popen_run(command_to_run);
enqueue_alarm_notify_in_progress(ae);
health_alarm_log_save(host, ae);
} else {
diff --git a/src/health/health_prototypes.c b/src/health/health_prototypes.c
index c43096115..a8681a453 100644
--- a/src/health/health_prototypes.c
+++ b/src/health/health_prototypes.c
@@ -687,15 +687,6 @@ void health_apply_prototypes_to_host(RRDHOST *host) {
health_prototype_reset_alerts_for_rrdset(st);
}
rrdset_foreach_done(st);
-
-#ifdef ENABLE_ACLK
- if (netdata_cloud_enabled) {
- struct aclk_sync_cfg_t *wc = host->aclk_config;
- if (likely(wc)) {
- wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
- }
- }
-#endif
}
void health_apply_prototypes_to_all_hosts(void) {
diff --git a/src/health/notifications/alarm-notify.sh.in b/src/health/notifications/alarm-notify.sh.in
index 9a5780de1..c7c44cb11 100755
--- a/src/health/notifications/alarm-notify.sh.in
+++ b/src/health/notifications/alarm-notify.sh.in
@@ -641,8 +641,12 @@ filter_recipient_by_criticality() {
;;
CLEAR)
- # remove tracking file
- [ -f "${tracking_file}" ] && rm "${tracking_file}"
+ if [ -f "${tracking_file}" ]; then
+ tracking_file_existed="yes"
+ rm "${tracking_file}"
+ else
+ tracking_file_existed=""
+ fi
# "noclear" modifier set, block notification
if [ "${mod_noclear}" == "1" ]; then
@@ -657,7 +661,7 @@ filter_recipient_by_criticality() {
fi
# "critical" modifier set, send notification if tracking file exists
- if [ "${mod_critical}" == "1" ] && [ -f "${tracking_file}" ]; then
+ if [ "${mod_critical}" == "1" ] && [ -n "${tracking_file_existed}" ]; then
debug "SEVERITY FILTERING for ${recipient_arg} VIA ${method}: ALLOW: recipient has been notified for this alarm in the past (no status change will be sent from now)"
return 0
fi
@@ -1515,13 +1519,20 @@ send_telegram() {
notify_telegram=1
notify_retries=${TELEGRAM_RETRIES_ON_LIMIT:-0}
+ IFS=":" read -r chatID threadID <<< "${chatid}"
+
+ # https://core.telegram.org/bots/api#sendmessage
+ api_url="https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatID}"
+ if [ -n "${threadID}" ]; then
+ api_url+="&message_thread_id=${threadID}"
+ fi
+
while [ ${notify_telegram} -eq 1 ]; do
- # https://core.telegram.org/bots/api#sendmessage
httpcode=$(docurl ${disableNotification} \
--data-urlencode "parse_mode=HTML" \
--data-urlencode "disable_web_page_preview=true" \
--data-urlencode "text=${emoji} ${message}" \
- "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatid}")
+ "${api_url}")
notify_telegram=0
diff --git a/src/health/notifications/health_alarm_notify.conf b/src/health/notifications/health_alarm_notify.conf
index f3b67c9de..9dcec27ae 100755
--- a/src/health/notifications/health_alarm_notify.conf
+++ b/src/health/notifications/health_alarm_notify.conf
@@ -413,6 +413,7 @@ DEFAULT_RECIPIENT_KAVENEGAR=""
# multiple recipients can be given like this:
# "CHAT_ID_1 CHAT_ID_2 ..."
+# To send alerts to a specific topic within a chat, use `CHAT_ID:TOPIC_ID`.
# enable/disable sending telegram messages
SEND_TELEGRAM="YES"
diff --git a/src/health/notifications/telegram/README.md b/src/health/notifications/telegram/README.md
index e263d0bb5..90cca4214 100644
--- a/src/health/notifications/telegram/README.md
+++ b/src/health/notifications/telegram/README.md
@@ -55,7 +55,7 @@ The following options can be defined for this notification
|:----|:-----------|:-------|:--------:|
| SEND_TELEGRAM | Set `SEND_TELEGRAM` to YES | YES | yes |
| TELEGRAM_BOT_TOKEN | set `TELEGRAM_BOT_TOKEN` to your bot token. | | yes |
-| DEFAULT_RECIPIENT_TELEGRAM | Set `DEFAULT_RECIPIENT_TELEGRAM` to the chat ID you want the alert notifications to be sent to. You can define multiple chat IDs like this: -49999333322 -1009999222255. | | yes |
+| DEFAULT_RECIPIENT_TELEGRAM | Set the `DEFAULT_RECIPIENT_TELEGRAM` variable in your config file to your Telegram chat ID (find it with @myidbot). Separate multiple chat IDs with spaces. To send alerts to a specific topic within a chat, use `chatID:topicID`. | | yes |
##### DEFAULT_RECIPIENT_TELEGRAM
diff --git a/src/health/notifications/telegram/metadata.yaml b/src/health/notifications/telegram/metadata.yaml
index cc6d8c91e..daa45da72 100644
--- a/src/health/notifications/telegram/metadata.yaml
+++ b/src/health/notifications/telegram/metadata.yaml
@@ -40,7 +40,7 @@
required: true
- name: 'DEFAULT_RECIPIENT_TELEGRAM'
default_value: ''
- description: "Set `DEFAULT_RECIPIENT_TELEGRAM` to the chat ID you want the alert notifications to be sent to. You can define multiple chat IDs like this: -49999333322 -1009999222255."
+ description: "Set the `DEFAULT_RECIPIENT_TELEGRAM` variable in your config file to your Telegram chat ID (find it with @myidbot). Separate multiple chat IDs with spaces. To send alerts to a specific topic within a chat, use `chatID:topicID`."
required: true
detailed_description: |
All roles will default to this variable if left unconfigured.