From a30a849b78fa4fe8552141b7b2802d1af1b18c09 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 13 Oct 2019 10:36:33 +0200 Subject: Adding upstream version 1.18.0. Signed-off-by: Daniel Baumann --- health/Makefile.am | 4 + health/README.md | 44 +++++++- health/health.c | 63 +++++++---- health/health.d/dbengine.conf | 44 ++++---- health/health.d/gearman.conf | 22 ++++ health/health.d/hdfs.conf | 75 +++++++++++++ health/health.d/mysql.conf | 34 ++++++ health/health.d/vcsa.conf | 122 +++++++++++++++++++++ health/health.d/zookeeper.conf | 14 +++ health/health.h | 3 + health/health_config.c | 183 +++++++++++++++++++++++--------- health/notifications/README.md | 13 ++- health/notifications/alarm-notify.sh.in | 4 +- health/notifications/email/README.md | 24 ++++- 14 files changed, 549 insertions(+), 100 deletions(-) create mode 100644 health/health.d/gearman.conf create mode 100644 health/health.d/hdfs.conf create mode 100644 health/health.d/vcsa.conf create mode 100644 health/health.d/zookeeper.conf (limited to 'health') diff --git a/health/Makefile.am b/health/Makefile.am index e9fceddb7..a314b3516 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -42,7 +42,9 @@ dist_healthconfig_DATA = \ health.d/fping.conf \ health.d/ioping.conf \ health.d/fronius.conf \ + health.d/gearman.conf \ health.d/haproxy.conf \ + health.d/hdfs.conf \ health.d/httpcheck.conf \ health.d/ipc.conf \ health.d/ipfs.conf \ @@ -84,10 +86,12 @@ dist_healthconfig_DATA = \ health.d/tcp_resets.conf \ health.d/udp_errors.conf \ health.d/varnish.conf \ + health.d/vcsa.conf \ health.d/vsphere.conf \ health.d/web_log.conf \ health.d/wmi.conf \ health.d/x509check.conf \ health.d/zfs.conf \ + health.d/zookeeper.conf \ health.d/dbengine.conf \ $(NULL) diff --git a/health/README.md b/health/README.md index ab8d6882a..0ffbbdb51 100644 --- a/health/README.md +++ b/health/README.md @@ -163,7 +163,7 @@ This line makes a database lookup to find a value. This result of this lookup is The format is: ``` -lookup: METHOD AFTER [at BEFORE] [every DURATION] [OPTIONS] [of DIMENSIONS] +lookup: METHOD AFTER [at BEFORE] [every DURATION] [OPTIONS] [of DIMENSIONS] [foreach DIMENSIONS] ``` Everything is the same with [badges](../web/api/badges/). In short: @@ -190,6 +190,11 @@ Everything is the same with [badges](../web/api/badges/). In short: have spaces in their names). This accepts Netdata simple patterns and the `match-ids` and `match-names` options affect the searches for dimensions. +- `foreach DIMENSIONS` is optional, will always be the last parameter, and uses the same `,`/`|` + rules as the `of` parameter. Each dimension you specify in `foreach` will use the same rule + to trigger an alarm. If you set both `of` and `foreach`, Netdata will ignore the `of` parameter + and replace it with one of the dimensions you gave to `foreach`. + The result of the lookup will be available as `$this` and `$NAME` in expressions. The timestamps of the timeframe evaluated by the database lookup is available as variables `$after` and `$before` (both are unix timestamps). @@ -660,6 +665,43 @@ Note that the drops chart does not exist if a network interface has never droppe When Netdata detects a dropped packet, it will add the chart and it will automatically attach this alarm to it. +### Example 5 + +Check if user or system dimension is using more than 50% of cpu: + +``` + alarm: dim_template + on: system.cpu + os: linux +lookup: average -3s percentage foreach system,user + units: % + every: 10s + warn: $this > 50 + crit: $this > 80 +``` + +The `lookup` line will calculate the average CPU usage from system and user in the last 3 seconds. Because we have +the foreach in the `lookup` line, Netdata will create two independent alarms called `dim_template_system` +and `dim_template_user` that will have all the other parameters shared among them. + +### Example 6 + +Check if all dimensions are using more than 50% of cpu: + +``` + alarm: dim_template + on: system.cpu + os: linux +lookup: average -3s percentage foreach * + units: % + every: 10s + warn: $this > 50 + crit: $this > 80 +``` + +The `lookup` line will calculate the average of CPU usage from system and user in the last 3 seconds. In this case +Netdata will create alarms for all dimensions of the chart. + ## Troubleshooting You can compile Netdata with [debugging](../daemon#debugging) and then set in `netdata.conf`: diff --git a/health/health.c b/health/health.c index 1460b5ba4..329191fb8 100644 --- a/health/health.c +++ b/health/health.c @@ -45,32 +45,30 @@ inline char *health_stock_config_dir(void) { * Function used to initialize the silencer structure. */ void health_silencers_init(void) { - struct stat statbuf; - if (!stat(silencers_filename,&statbuf)) { - off_t length = statbuf.st_size; - if (length && length < HEALTH_SILENCERS_MAX_FILE_LEN) { - FILE *fd = fopen(silencers_filename, "r"); - if (fd) { - char *str = mallocz((length+1)* sizeof(char)); - if(str) { - size_t copied; - copied = fread(str, sizeof(char), length, fd); - if (copied == (length* sizeof(char))) { - str[length] = 0x00; - json_parse(str, NULL, health_silencers_json_read_callback); - info("Parsed health silencers file %s", silencers_filename); - } else { - error("Cannot read the data from health silencers file %s", silencers_filename); - } - freez(str); + FILE *fd = fopen(silencers_filename, "r"); + if (fd) { + fseek(fd, 0 , SEEK_END); + off_t length = (off_t) ftell(fd); + fseek(fd, 0 , SEEK_SET); + + if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) { + char *str = mallocz((length+1)* sizeof(char)); + if(str) { + size_t copied; + copied = fread(str, sizeof(char), length, fd); + if (copied == (length* sizeof(char))) { + str[length] = 0x00; + json_parse(str, NULL, health_silencers_json_read_callback); + info("Parsed health silencers file %s", silencers_filename); + } else { + error("Cannot read the data from health silencers file %s", silencers_filename); } - fclose(fd); - } else { - error("Cannot open the file %s",silencers_filename); + freez(str); } } else { error("Health silencers file %s has the size %ld that is out of range[ 1 , %d ]. Aborting read.", silencers_filename, length, HEALTH_SILENCERS_MAX_FILE_LEN); } + fclose(fd); } else { error("Cannot open the file %s",silencers_filename); } @@ -115,9 +113,23 @@ void health_reload_host(RRDHOST *host) { while(host->templates) rrdcalctemplate_unlink_and_free(host, host->templates); + RRDCALCTEMPLATE *rt,*next; + for(rt = host->alarms_template_with_foreach; rt ; rt = next) { + next = rt->next; + rrdcalctemplate_free(rt); + } + host->alarms_template_with_foreach = NULL; + while(host->alarms) rrdcalc_unlink_and_free(host, host->alarms); + RRDCALC *rc,*nc; + for(rc = host->alarms_with_foreach; rc ; rc = nc) { + nc = rc->next; + rrdcalc_free(rc); + } + host->alarms_with_foreach = NULL; + rrdhost_unlock(host); // invalidate all previous entries in the alarm log @@ -141,9 +153,17 @@ void health_reload_host(RRDHOST *host) { health_readdir(host, user_path, stock_path, NULL); // link the loaded alarms to their charts + RRDDIM *rd; rrdset_foreach_write(st, host) { rrdsetcalc_link_matching(st); rrdcalctemplate_link_matching(st); + + //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it. + rrdset_rdlock(st); + rrddim_foreach_read(rd, st) { + rrdcalc_link_to_rrddim(rd, st, host); + } + rrdset_unlock(st); } rrdhost_unlock(host); @@ -890,6 +910,7 @@ void *health_main(void *ptr) { } } } + if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { rc->last_repeat = now; ALARM_ENTRY *ae = health_create_alarm_entry( diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index 956abf294..ce6427cd2 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -1,26 +1,26 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: 10min_dbengine_global_fs_errors - on: netdata.dbengine_global_errors - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of FS errors - units: errors - every: 10s - crit: $this > 0 - delay: down 15m multiplier 1.5 max 1h - info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) - to: sysadmin + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of FS errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + to: sysadmin - alarm: 10min_dbengine_global_io_errors - on: netdata.dbengine_global_errors - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of I/O errors - units: errors - every: 10s - crit: $this > 0 - delay: down 1h multiplier 1.5 max 3h - info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) - to: sysadmin + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of I/O errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) + to: sysadmin \ No newline at end of file diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf new file mode 100644 index 000000000..e3863ae5e --- /dev/null +++ b/health/health.d/gearman.conf @@ -0,0 +1,22 @@ +# make sure Gearman is running +template: gearman_last_collected_secs + on: gearman.total_jobs + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +template: gearman_workers_queued + on: gearman.single_job + lookup: average -10m unaligned match-names of Queued + units: workers + every: 10s + warn: $this > 30000 + crit: $this > 100000 + delay: down 5m multiplier 1.5 max 1h + info: number of queued jobs + to: sysadmin \ No newline at end of file diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf new file mode 100644 index 000000000..678faab4c --- /dev/null +++ b/health/health.d/hdfs.conf @@ -0,0 +1,75 @@ + +# make sure hdfs is running + +template: hdfs_last_collected_secs + on: hdfs.heap_memory + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + +# Common + +template: hdfs_capacity_usage + on: hdfs.capacity + calc: ($used) * 100 / ($used + $remaining) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used capacity + to: sysadmin + + +# NameNode + +template: hdfs_missing_blocks + on: hdfs.blocks + calc: $missing + units: missing blocks + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: missing blocks + to: sysadmin + + +template: hdfs_stale_nodes + on: hdfs.data_nodes + calc: $stale + units: dead nodes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: stale data nodes + to: sysadmin + + +template: hdfs_dead_nodes + on: hdfs.data_nodes + calc: $dead + units: dead nodes + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: dead data nodes + to: sysadmin + + +# DataNode + +template: hdfs_num_failed_volumes + on: hdfs.num_failed_volumes + calc: $fsds_num_failed_volumes + units: failed volumes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: failed volumes + to: sysadmin diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 39c401915..ce7b98a87 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -98,3 +98,37 @@ template: mysql_replication_lag info: the number of seconds mysql replication is behind this master to: dba + +# ----------------------------------------------------------------------------- +# galera cluster size + +template: mysql_galera_cluster_size_max_2m + on: mysql.galera_cluster_size + lookup: max -2m absolute + units: nodes + every: 10s + info: max cluster size 2 minute + to: dba + +template: mysql_galera_cluster_size + on: mysql.galera_cluster_size + calc: $nodes + units: nodes + every: 10s + warn: $this > $mysql_galera_cluster_size_max_2m + crit: $this < $mysql_galera_cluster_size_max_2m + delay: up 20s down 5m multiplier 1.5 max 1h + info: cluster size has changed + to: dba + +# galera node state + +template: mysql_galera_cluster_state + on: mysql.galera_cluster_state + calc: $state + every: 10s + warn: $this < 4 + crit: $this < 2 + delay: up 30s down 5m multiplier 1.5 max 1h + info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced) + to: dba diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf new file mode 100644 index 000000000..7bb98a9ba --- /dev/null +++ b/health/health.d/vcsa.conf @@ -0,0 +1,122 @@ + +# make sure vcsa is running and responding + +template: vcsa_last_collected_secs + on: vcsa.system_health + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# Overall system health: +# - 0: all components are healthy. +# - 1: one or more components might become overloaded soon. +# - 2: one or more components in the appliance might be degraded. +# - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon. +# - 4: no health data is available. + +template: vcsa_system_health + on: vcsa.system_health + lookup: max -10s unaligned of system + units: status + every: 10s + warn: ($this == 1) || ($this == 2) + crit: $this == 3 + delay: down 1m multiplier 1.5 max 1h + info: overall system health status + to: sysadmin + +# Components health: +# - 0: healthy. +# - 1: healthy, but may have some problems. +# - 2: degraded, and may have serious problems. +# - 3: unavailable, or will stop functioning soon. +# - 4: no health data is available. + +template: vcsa_swap_health + on: vcsa.components_health + lookup: max -10s unaligned of swap + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: swap health status + to: sysadmin + +template: vcsa_storage_health + on: vcsa.components_health + lookup: max -10s unaligned of storage + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: storage health status + to: sysadmin + +template: vcsa_mem_health + on: vcsa.components_health + lookup: max -10s unaligned of mem + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: mem health status + to: sysadmin + +template: vcsa_load_health + on: vcsa.components_health + lookup: max -10s unaligned of load + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: load health status + to: sysadmin + +template: vcsa_database_storage_health + on: vcsa.components_health + lookup: max -10s unaligned of database_storage + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: database storage health status + to: sysadmin + +template: vcsa_applmgmt_health + on: vcsa.components_health + lookup: max -10s unaligned of applmgmt + units: status + every: 10s + warn: $this == 1 + crit: ($this == 2) || ($this == 3) + delay: down 1m multiplier 1.5 max 1h + info: appl mgmt health status + to: sysadmin + + +# Software updates health: +# - 0: no updates available. +# - 2: non-security updates are available. +# - 3: security updates are available. +# - 4: an error retrieving information on software updates. + +template: vcsa_software_updates_health + on: vcsa.software_updates_health + lookup: max -10s unaligned of software_packages + units: status + every: 10s + warn: $this == 4 + crit: $this == 3 + delay: down 1m multiplier 1.5 max 1h + info: software packages health status + to: sysadmin diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf new file mode 100644 index 000000000..ffbe31baf --- /dev/null +++ b/health/health.d/zookeeper.conf @@ -0,0 +1,14 @@ + +# make sure zookeeper is running + +template: zookeeper_last_collected_secs + on: zookeeper.requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + diff --git a/health/health.h b/health/health.h index 8e4d0f7cb..ab367e903 100644 --- a/health/health.h +++ b/health/health.h @@ -48,6 +48,7 @@ extern unsigned int default_health_enabled; #define HEALTH_INFO_KEY "info" #define HEALTH_DELAY_KEY "delay" #define HEALTH_OPTIONS_KEY "options" +#define HEALTH_FOREACH_KEY "foreach" #define HEALTH_SILENCERS_MAX_FILE_LEN 10000 @@ -106,4 +107,6 @@ extern void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); extern void *health_cmdapi_thread(void *ptr); +extern SIMPLE_PATTERN *health_pattern_from_foreach(char *s); + #endif //NETDATA_HEALTH_H diff --git a/health/health_config.c b/health/health_config.c index 0d6e77a9e..65c6d8bd7 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -46,7 +46,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id); - debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", + debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rc->chart?rc->chart:"NOCHART", rc->name, rc->id, @@ -59,6 +59,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->before, rc->options, (rc->dimensions)?rc->dimensions:"NONE", + (rc->foreachdim)?rc->foreachdim:"NONE", rc->update_every, (rc->calculation)?rc->calculation->parsed_as:"NONE", (rc->warning)?rc->warning->parsed_as:"NONE", @@ -73,6 +74,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { ); rrdcalc_add_to_host(host, rc); + return 1; } @@ -93,48 +95,70 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL } RRDCALCTEMPLATE *t, *last = NULL; - for (t = host->templates; t ; last = t, t = t->next) { - if(unlikely(t->hash_name == rt->hash_name - && !strcmp(t->name, rt->name) - && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*") - )) { - error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); - return 0; + if(!rt->foreachdim) { + for (t = host->templates; t ; last = t, t = t->next) { + if(unlikely(t->hash_name == rt->hash_name + && !strcmp(t->name, rt->name) + && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*") + )) { + error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); + return 0; + } + } + + if(likely(last)) { + last->next = rt; + } + else { + rt->next = host->templates; + host->templates = rt; + } + } else { + for (t = host->alarms_template_with_foreach; t ; last = t, t = t->next) { + if(unlikely(t->hash_name == rt->hash_name + && !strcmp(t->name, rt->name) + && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*") + )) { + error("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); + return 0; + } + } + + if(likely(last)) { + last->next = rt; + } + else { + rt->next = host->alarms_template_with_foreach; + host->alarms_template_with_foreach = rt; } } - debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", - rt->name, - (rt->context)?rt->context:"NONE", - (rt->exec)?rt->exec:"DEFAULT", - (rt->recipient)?rt->recipient:"DEFAULT", - rt->green, - rt->red, - (int)rt->group, - rt->after, - rt->before, - rt->options, - (rt->dimensions)?rt->dimensions:"NONE", - rt->update_every, - (rt->calculation)?rt->calculation->parsed_as:"NONE", - (rt->warning)?rt->warning->parsed_as:"NONE", - (rt->critical)?rt->critical->parsed_as:"NONE", - rt->source, - rt->delay_up_duration, - rt->delay_down_duration, - rt->delay_max_duration, - rt->delay_multiplier, - rt->warn_repeat_every, - rt->crit_repeat_every + debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", + rt->name, + (rt->context)?rt->context:"NONE", + (rt->exec)?rt->exec:"DEFAULT", + (rt->recipient)?rt->recipient:"DEFAULT", + rt->green, + rt->red, + (int)rt->group, + rt->after, + rt->before, + rt->options, + (rt->dimensions)?rt->dimensions:"NONE", + (rt->foreachdim)?rt->foreachdim:"NONE", + rt->update_every, + (rt->calculation)?rt->calculation->parsed_as:"NONE", + (rt->warning)?rt->warning->parsed_as:"NONE", + (rt->critical)?rt->critical->parsed_as:"NONE", + rt->source, + rt->delay_up_duration, + rt->delay_down_duration, + rt->delay_max_duration, + rt->delay_multiplier, + rt->warn_repeat_every, + rt->crit_repeat_every ); - if(likely(last)) { - last->next = rt; - } - else { - rt->next = host->templates; - host->templates = rt; - } return 1; } @@ -291,16 +315,37 @@ static inline int health_parse_repeat( return 1; } +/** + * Health pattern from Foreach + * + * Create a new simple pattern using the user input + * + * @param s the string that will be used to create the simple pattern. + */ +SIMPLE_PATTERN *health_pattern_from_foreach(char *s) { + char *convert= strdupz(s); + SIMPLE_PATTERN *val = NULL; + if(convert) { + dimension_remove_pipe_comma(convert); + val = simple_pattern_create(convert, NULL, SIMPLE_PATTERN_EXACT); + + freez(convert); + } + + return val; +} static inline int health_parse_db_lookup( size_t line, const char *filename, char *string, RRDR_GROUPING *group_method, int *after, int *before, int *every, - uint32_t *options, char **dimensions + uint32_t *options, char **dimensions, char **foreachdim ) { debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s: %s", line, filename, string); if(*dimensions) freez(*dimensions); + if(*foreachdim) freez(*foreachdim); *dimensions = NULL; + *foreachdim = NULL; *after = 0; *before = 0; *every = 0; @@ -387,8 +432,22 @@ static inline int health_parse_db_lookup( *options |= RRDR_OPTION_MATCH_NAMES; } else if(!strcasecmp(key, "of")) { - if(*s && strcasecmp(s, "all") != 0) + char *find = NULL; + if(*s && strcasecmp(s, "all") != 0) { + find = strcasestr(s, " foreach"); + if(find) { + *find = '\0'; + } *dimensions = strdupz(s); + } + + if(!find) { + break; + } + s = ++find; + } + else if(!strcasecmp(key, HEALTH_FOREACH_KEY )) { + *foreachdim = strdupz(s); break; } else { @@ -521,8 +580,12 @@ static int health_readfile(const char *filename, void *data) { uint32_t hash = simple_uhash(key); if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { - if (rc && (ignore_this || !rrdcalc_add_alarm_from_config(host, rc))) - rrdcalc_free(rc); + if(rc) { + if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { + rrdcalc_free(rc); + } + // health_add_alarms_loop(host, rc, ignore_this) ; + } if(rt) { if (ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) @@ -552,14 +615,18 @@ static int health_readfile(const char *filename, void *data) { } else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) { if(rc) { - if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) +// health_add_alarms_loop(host, rc, ignore_this) ; + if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { rrdcalc_free(rc); + } rc = NULL; } - if(rt && (ignore_this || !rrdcalctemplate_add_template_from_config(host, rt))) - rrdcalctemplate_free(rt); + if(rt) { + if(ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) + rrdcalctemplate_free(rt); + } rt = callocz(1, sizeof(RRDCALCTEMPLATE)); rt->name = strdupz(value); @@ -622,8 +689,10 @@ static int health_readfile(const char *filename, void *data) { } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { health_parse_db_lookup(line, filename, value, &rc->group, &rc->after, &rc->before, - &rc->update_every, - &rc->options, &rc->dimensions); + &rc->update_every, &rc->options, &rc->dimensions, &rc->foreachdim); + if(rc->foreachdim) { + rc->spdim = health_pattern_from_foreach(rc->foreachdim); + } } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { if(!config_parse_duration(value, &rc->update_every)) @@ -752,7 +821,10 @@ static int health_readfile(const char *filename, void *data) { } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { health_parse_db_lookup(line, filename, value, &rt->group, &rt->after, &rt->before, - &rt->update_every, &rt->options, &rt->dimensions); + &rt->update_every, &rt->options, &rt->dimensions, &rt->foreachdim); + if(rt->foreachdim) { + rt->spdim = health_pattern_from_foreach(rt->foreachdim); + } } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { if(!config_parse_duration(value, &rt->update_every)) @@ -866,11 +938,17 @@ static int health_readfile(const char *filename, void *data) { } } - if(rc && (ignore_this || !rrdcalc_add_alarm_from_config(host, rc))) - rrdcalc_free(rc); + if(rc) { + //health_add_alarms_loop(host, rc, ignore_this) ; + if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { + rrdcalc_free(rc); + } + } - if(rt && (ignore_this || !rrdcalctemplate_add_template_from_config(host, rt))) - rrdcalctemplate_free(rt); + if(rt) { + if(ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) + rrdcalctemplate_free(rt); + } fclose(fp); return 1; @@ -881,5 +959,6 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", host->hostname); return; } + recursive_config_double_dir_load(user_path, stock_path, subpath, health_readfile, (void *) host, 0); } diff --git a/health/notifications/README.md b/health/notifications/README.md index a0065729a..c086e79b1 100644 --- a/health/notifications/README.md +++ b/health/notifications/README.md @@ -39,7 +39,18 @@ by running `/etc/netdata/edit-config health_alarm_notify.conf`: all notification methods except email, require some configuration (i.e. API keys, tokens, destination rooms, channels, etc). -2. **recipients** per **role** per **notification method** +- **recipients** per **role** per **notification method** + +```sh +grep sysadmin /etc/netdata/health_alarm_notify.conf + +role_recipients_email[sysadmin]="${DEFAULT_RECIPIENT_EMAIL}" +role_recipients_pushover[sysadmin]="${DEFAULT_RECIPIENT_PUSHOVER}" +role_recipients_pushbullet[sysadmin]="${DEFAULT_RECIPIENT_PUSHBULLET}" +role_recipients_telegram[sysadmin]="${DEFAULT_RECIPIENT_TELEGRAM}" +role_recipients_slack[sysadmin]="${DEFAULT_RECIPIENT_SLACK}" +... +``` ## Testing Notifications diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index bbb960918..509a8e88d 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -194,7 +194,7 @@ fi # ----------------------------------------------------------------------------- # parse command line parameters -if [ ${1} = "unittest" ]; then +if [[ ${1} = "unittest" ]]; then unittest=1 # enable unit testing mode roles="${2}" # the role that should be used for unit testing cfgfile="${3}" # the location of the config file to use for unit testing @@ -1783,7 +1783,7 @@ url_family="${REPLY}" urlencode "${name}" >/dev/null url_name="${REPLY}" -redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}" +redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}" GOTOCLOUD=0 if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md index 92916d192..bf03887ac 100644 --- a/health/notifications/email/README.md +++ b/health/notifications/email/README.md @@ -1,4 +1,4 @@ -# email +# Email You need a working `sendmail` command for email alerts to work. Almost all MTAs provide a `sendmail` interface. @@ -33,4 +33,26 @@ Where `[ROLE]` is the role you want to test. The default (if you don't give a `[ Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). You can always find the location of the alarm-notify.sh script in `netdata.conf`. +## Simple SMTP transport configuration + +If you want an alternative to `sendmail` in order to have a simple MTA configuration for sending emails and auth to an existing SMTP server, you can do the following: + +- Install `msmtp`. +- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `mstmp`: +``` +# The full path to the sendmail command. +# If empty, the system $PATH will be searched for it. +# If not found, email notifications will be disabled (silently). +sendmail="/usr/bin/msmtp" +``` +- Login as netdata : +```sh +(sudo) su -s /bin/bash netdata +``` +- Configure `~/.msmtprc` as shown [in the documentation](https://marlam.de/msmtp/documentation/). +- Finaly set the appropriate permissions on the `.msmtprc` file : +```sh +chmod 600 ~/.msmtprc +``` + [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Femail%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) -- cgit v1.2.3