summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2021-12-01 06:15:11 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2021-12-01 06:15:11 +0000
commit483926a283e118590da3f9ecfa75a8a4d62143ce (patch)
treecb77052778df9a128a8cd3ff5bf7645322a13bc5 /health
parentReleasing debian version 1.31.0-4. (diff)
downloadnetdata-483926a283e118590da3f9ecfa75a8a4d62143ce.tar.xz
netdata-483926a283e118590da3f9ecfa75a8a4d62143ce.zip
Merging upstream version 1.32.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health')
-rw-r--r--health/Makefile.am19
-rw-r--r--health/REFERENCE.md60
-rw-r--r--health/health.c73
-rw-r--r--health/health.d/adaptec_raid.conf8
-rw-r--r--health/health.d/am2320.conf15
-rw-r--r--health/health.d/anomalies.conf8
-rw-r--r--health/health.d/apcupsd.conf12
-rw-r--r--health/health.d/backend.conf12
-rw-r--r--health/health.d/bcache.conf8
-rw-r--r--health/health.d/beanstalkd.conf4
-rw-r--r--health/health.d/bind_rndc.conf4
-rw-r--r--health/health.d/boinc.conf16
-rw-r--r--health/health.d/btrfs.conf16
-rw-r--r--health/health.d/ceph.conf4
-rw-r--r--health/health.d/cgroups.conf8
-rw-r--r--health/health.d/cockroachdb.conf72
-rw-r--r--health/health.d/couchdb.conf16
-rw-r--r--health/health.d/cpu.conf16
-rw-r--r--health/health.d/dbengine.conf16
-rw-r--r--health/health.d/disks.conf26
-rw-r--r--health/health.d/dns_query.conf4
-rw-r--r--health/health.d/dnsmasq_dhcp.conf4
-rw-r--r--health/health.d/dockerd.conf4
-rw-r--r--health/health.d/elasticsearch.conf15
-rw-r--r--health/health.d/entropy.conf4
-rw-r--r--health/health.d/exporting.conf29
-rw-r--r--health/health.d/fping.conf16
-rw-r--r--health/health.d/fronius.conf4
-rw-r--r--health/health.d/gearman.conf20
-rw-r--r--health/health.d/geth.conf12
-rw-r--r--health/health.d/go.d.plugin.conf (renamed from health/health.d/nginx_plus.conf)14
-rw-r--r--health/health.d/haproxy.conf21
-rw-r--r--health/health.d/hdfs.conf37
-rw-r--r--health/health.d/httpcheck.conf46
-rw-r--r--health/health.d/ioping.conf4
-rw-r--r--health/health.d/ipc.conf8
-rw-r--r--health/health.d/ipfs.conf4
-rw-r--r--health/health.d/ipmi.conf8
-rw-r--r--health/health.d/kubelet.conf36
-rw-r--r--health/health.d/lighttpd.conf17
-rw-r--r--health/health.d/linux_power_supply.conf4
-rw-r--r--health/health.d/load.conf16
-rw-r--r--health/health.d/mdstat.conf16
-rw-r--r--health/health.d/megacli.conf20
-rw-r--r--health/health.d/memcached.conf29
-rw-r--r--health/health.d/memory.conf12
-rw-r--r--health/health.d/mongodb.conf16
-rw-r--r--health/health.d/mysql.conf62
-rw-r--r--health/health.d/named.conf17
-rw-r--r--health/health.d/net.conf60
-rw-r--r--health/health.d/netfilter.conf4
-rw-r--r--health/health.d/nginx.conf17
-rw-r--r--health/health.d/phpfpm.conf17
-rw-r--r--health/health.d/pihole.conf49
-rw-r--r--health/health.d/portcheck.conf26
-rw-r--r--health/health.d/postgres.conf16
-rw-r--r--health/health.d/processes.conf4
-rw-r--r--health/health.d/pulsar.conf16
-rw-r--r--health/health.d/python.d.plugin.conf (renamed from health/health.d/apache.conf)14
-rw-r--r--health/health.d/ram.conf48
-rw-r--r--health/health.d/redis.conf24
-rw-r--r--health/health.d/retroshare.conf19
-rw-r--r--health/health.d/riakkv.conf38
-rw-r--r--health/health.d/scaleio.conf24
-rw-r--r--health/health.d/softnet.conf12
-rw-r--r--health/health.d/squid.conf17
-rw-r--r--health/health.d/stiebeleltron.conf4
-rw-r--r--health/health.d/swap.conf10
-rw-r--r--health/health.d/systemdunits.conf40
-rw-r--r--health/health.d/tcp_conn.conf4
-rw-r--r--health/health.d/tcp_listen.conf16
-rw-r--r--health/health.d/tcp_mem.conf4
-rw-r--r--health/health.d/tcp_orphans.conf4
-rw-r--r--health/health.d/tcp_resets.conf16
-rw-r--r--health/health.d/timex.conf17
-rw-r--r--health/health.d/udp_errors.conf8
-rw-r--r--health/health.d/unbound.conf24
-rw-r--r--health/health.d/varnish.conf12
-rw-r--r--health/health.d/vcsa.conf48
-rw-r--r--health/health.d/vernemq.conf120
-rw-r--r--health/health.d/vsphere.conf44
-rw-r--r--health/health.d/web_log.conf135
-rw-r--r--health/health.d/whoisquery.conf21
-rw-r--r--health/health.d/wmi.conf50
-rw-r--r--health/health.d/x509check.conf25
-rw-r--r--health/health.d/zfs.conf12
-rw-r--r--health/health.d/zookeeper.conf17
-rw-r--r--health/health.h6
-rw-r--r--health/health_config.c153
-rw-r--r--health/health_json.c18
-rw-r--r--health/health_log.c64
-rwxr-xr-xhealth/notifications/alarm-notify.sh.in888
-rw-r--r--health/notifications/custom/README.md6
-rw-r--r--health/notifications/email/README.md18
-rwxr-xr-xhealth/notifications/health_alarm_notify.conf4
-rw-r--r--health/notifications/syslog/README.md2
96 files changed, 1732 insertions, 1375 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index b963ea0cd..349b86d61 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -25,9 +25,7 @@ install-exec-local:
healthconfigdir=$(libconfigdir)/health.d
dist_healthconfig_DATA = \
health.d/adaptec_raid.conf \
- health.d/am2320.conf \
health.d/anomalies.conf \
- health.d/apache.conf \
health.d/apcupsd.conf \
health.d/backend.conf \
health.d/bcache.conf \
@@ -39,18 +37,18 @@ dist_healthconfig_DATA = \
health.d/cgroups.conf \
health.d/cpu.conf \
health.d/cockroachdb.conf \
- health.d/couchdb.conf \
health.d/disks.conf \
health.d/dnsmasq_dhcp.conf \
health.d/dns_query.conf \
health.d/dockerd.conf \
- health.d/elasticsearch.conf \
health.d/entropy.conf \
health.d/exporting.conf \
health.d/fping.conf \
+ health.d/geth.conf \
health.d/ioping.conf \
health.d/fronius.conf \
health.d/gearman.conf \
+ health.d/go.d.plugin.conf \
health.d/haproxy.conf \
health.d/hdfs.conf \
health.d/httpcheck.conf \
@@ -59,26 +57,19 @@ dist_healthconfig_DATA = \
health.d/ipmi.conf \
health.d/isc_dhcpd.conf \
health.d/kubelet.conf \
- health.d/lighttpd.conf \
health.d/linux_power_supply.conf \
health.d/load.conf \
health.d/mdstat.conf \
health.d/megacli.conf \
health.d/memcached.conf \
health.d/memory.conf \
- health.d/mongodb.conf \
health.d/mysql.conf \
- health.d/named.conf \
health.d/net.conf \
health.d/netfilter.conf \
- health.d/nginx.conf \
- health.d/nginx_plus.conf \
health.d/pihole.conf \
- health.d/phpfpm.conf \
health.d/portcheck.conf \
- health.d/postgres.conf \
health.d/processes.conf \
- health.d/pulsar.conf \
+ health.d/python.d.plugin.conf \
health.d/qos.conf \
health.d/ram.conf \
health.d/redis.conf \
@@ -86,11 +77,11 @@ dist_healthconfig_DATA = \
health.d/riakkv.conf \
health.d/scaleio.conf \
health.d/softnet.conf \
- health.d/squid.conf \
health.d/stiebeleltron.conf \
health.d/synchronization.conf \
health.d/swap.conf \
health.d/systemdunits.conf \
+ health.d/timex.conf \
health.d/tcp_conn.conf \
health.d/tcp_listen.conf \
health.d/tcp_mem.conf \
@@ -98,7 +89,6 @@ dist_healthconfig_DATA = \
health.d/tcp_resets.conf \
health.d/udp_errors.conf \
health.d/unbound.conf \
- health.d/varnish.conf \
health.d/vcsa.conf \
health.d/vernemq.conf \
health.d/vsphere.conf \
@@ -107,6 +97,5 @@ dist_healthconfig_DATA = \
health.d/wmi.conf \
health.d/x509check.conf \
health.d/zfs.conf \
- health.d/zookeeper.conf \
health.d/dbengine.conf \
$(NULL)
diff --git a/health/REFERENCE.md b/health/REFERENCE.md
index 5ea6b7c5d..f1bb5557d 100644
--- a/health/REFERENCE.md
+++ b/health/REFERENCE.md
@@ -54,14 +54,17 @@ Netdata parses the following lines. Beneath the table is an in-depth explanation
- A few lines use space-separated lists to define how the entity behaves. You can use `*` as a wildcard or prefix with
`!` for a negative match. Order is important, too! See our [simple patterns docs](../libnetdata/simple_pattern/) for
more examples.
+- Lines terminated by a `\` are spliced together with the next line. The backslash is removed and the following line is
+ joined with the current one. No space is inserted, so you may split a line anywhere, even in the middle of a word.
+ This comes in handy if your `info` line consists of several sentences.
| line | required | functionality |
| --------------------------------------------------- | --------------- | ------------------------------------------------------------------------------------- |
| [`alarm`/`template`](#alarm-line-alarm-or-template) | yes | Name of the alarm/template. |
| [`on`](#alarm-line-on) | yes | The chart this alarm should attach to. |
-| [`class`](#alarm-line-class) | no | The general classification of the alarm. |
-| [`component`](#alarm-line-component) | no | Specify the component of the class of the alarm. |
-| [`type`](#alarm-line-type) | no | The type of error the alarm monitors. |
+| [`class`](#alarm-line-class) | no | The general alarm classification. |
+| [`type`](#alarm-line-type) | no | What area of the system the alarm monitors. |
+| [`component`](#alarm-line-component) | no | Specific component of the type of the alarm. |
| [`os`](#alarm-line-os) | no | Which operating systems to run this alarm on. |
| [`hosts`](#alarm-line-hosts) | no | Which hostnames will run this alarm. |
| [`plugin`](#alarm-line-plugin) | no | Restrict an alarm or template to only a certain plugin. |
@@ -136,24 +139,45 @@ If you create a template using the `disk.io` context, it will apply an alarm to
#### Alarm line `class`
-Specify the classification of the alarm or template.
+This indicates the type of error (or general problem area) that the alarm or template applies to. For example, `Latency` can be used for alarms that trigger on latency issues on network interfaces, web servers, or database systems. Example:
-Class can be used to indicate the broader area of the system that the alarm applies to. For example, under the general `Database` class, you can group together alarms that operate on various database systems, like `MySQL`, `CockroachDB`, `CouchDB` etc. Example:
+```yaml
+class: Latency
+```
+
+<details>
+<summary>Netdata's stock alarms use the following `class` attributes by default:</summary>
+
+| Class |
+| ----------------|
+| Errors |
+| Latency |
+| Utilization |
+| Workload |
+
+
+</details>
+
+`class` will default to `Unknown` if the line is missing from the alarm configuration.
+
+#### Alarm line `type`
+
+Type can be used to indicate the broader area of the system that the alarm applies to. For example, under the general `Database` type, you can group together alarms that operate on various database systems, like `MySQL`, `CockroachDB`, `CouchDB` etc. Example:
```yaml
-class: Database
+type: Database
```
<details>
-<summary>Netdata's stock alarms use the following `class` attributes by default, but feel free to adjust for your own requirements.</summary>
+<summary>Netdata's stock alarms use the following `type` attributes by default, but feel free to adjust for your own requirements.</summary>
-| Class | Description |
+| Type | Description |
| ------------------------ | ------------------------------------------------------------------------------------------------ |
| Ad Filtering | Services related to Ad Filtering (like pi-hole) |
| Certificates | Certificates monitoring related |
| Cgroups | Alerts for cpu and memory usage of control groups |
| Computing | Alerts for shared computing applications (e.g. boinc) |
| Containers | Container related alerts (e.g. docker instances) |
-| Database | Database systems (e.g. MySQL, Postgress, etc) |
+| Database | Database systems (e.g. MySQL, PostgreSQL, etc) |
| Data Sharing | Used to group together alerts for data sharing applications |
| DHCP | Alerts for dhcp related services |
| DNS | Alerts for dns related services |
@@ -162,7 +186,7 @@ class: Database
| Linux | Services specific to Linux (e.g. systemd) |
| Messaging | Alerts for message passing services (e.g. vernemq) |
| Netdata | Internal Netdata components monitoring |
-| Other | Use as a general class of alerts |
+| Other | When an alert doesn't fit in other types. |
| Power Supply | Alerts from power supply related services (e.g. apcupsd) |
| Search engine | Alerts for search services (e.g. elasticsearch) |
| Storage | Alerts dealing with storage services (storage devices typically live under `System`) |
@@ -174,26 +198,16 @@ class: Database
</details>
-If an alarm configuration is missing the `class` line, its value will default to `Unknown`.
+If an alarm configuration is missing the `type` line, its value will default to `Unknown`.
#### Alarm line `component`
-Component can be used to narrow down what the previous `class` value specifies for each alarm or template. Continuing from the previous example, `component` might include `MySQL`, `CockroachDB`, `MongoDB`, all under the same `Database` classification. Example:
+Component can be used to narrow down what the previous `type` value specifies for each alarm or template. Continuing from the previous example, `component` might include `MySQL`, `CockroachDB`, `MongoDB`, all under the same `Database` type. Example:
```yaml
component: MySQL
```
-As with the `class` line, if `component` is missing from the configuration, its value will default to `Unknown`.
-
-#### Alarm line `type`
-
-This indicates the type of error (or general problem area) that the alarm or template applies to. For example, `Latency` can be used for alarms that trigger on latency issues in network interfaces, web servers, or database systems. Example:
-
-```yaml
-type: Latency
-```
-
-`type` will also (as with `class` and `component`) default to `Unknown` if the line is missing from the alarm configuration.
+As with the `class` and `type` lines, if `component` is missing from the configuration, its value will default to `Unknown`.
#### Alarm line `os`
diff --git a/health/health.c b/health/health.c
index 85d2a2458..d8e1d4b77 100644
--- a/health/health.c
+++ b/health/health.c
@@ -230,6 +230,9 @@ void health_reload(void) {
if (netdata_cloud_setting) {
aclk_single_update_enable();
aclk_alarm_reload();
+#ifdef ENABLE_NEW_CLOUD_PROTOCOL
+ aclk_alert_reloaded = 1;
+#endif
}
#endif
}
@@ -308,26 +311,44 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
int n_warn=0, n_crit=0;
RRDCALC *rc;
EVAL_EXPRESSION *expr=NULL;
+ BUFFER *warn_alarms, *crit_alarms;
+
+ warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
+ crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
for(rc = host->alarms; rc ; rc = rc->next) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
- if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
- n_warn++;
- if (ae->alarm_id == rc->id)
- expr=rc->warning;
+ if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
+ if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
+ if (n_warn)
+ buffer_strcat(warn_alarms, ",");
+ buffer_strcat(warn_alarms, rc->name);
+ buffer_strcat(warn_alarms, "=");
+ buffer_snprintf(warn_alarms, 11, "%ld", rc->last_status_change);
+ n_warn++;
+ } else if (ae->alarm_id == rc->id)
+ expr = rc->warning;
} else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
- n_crit++;
- if (ae->alarm_id == rc->id)
- expr=rc->critical;
+ if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
+ if (n_crit)
+ buffer_strcat(crit_alarms, ",");
+ buffer_strcat(crit_alarms, rc->name);
+ buffer_strcat(crit_alarms, "=");
+ buffer_snprintf(crit_alarms, 11, "%ld", rc->last_status_change);
+ n_crit++;
+ } else if (ae->alarm_id == rc->id)
+ expr = rc->critical;
} else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
if (ae->alarm_id == rc->id)
- expr=rc->warning;
+ expr = rc->warning;
}
}
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d'",
+ char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0");
+
+ snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
exec,
recipient,
host->registry_hostname,
@@ -352,7 +373,12 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
(expr && expr->source)?expr->source:"NOSOURCE",
(expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
n_warn,
- n_crit
+ n_crit,
+ buffer_tostring(warn_alarms),
+ buffer_tostring(crit_alarms),
+ ae->classification?ae->classification:"Unknown",
+ edit_command,
+ localhost->registry_hostname
);
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
@@ -363,6 +389,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
enqueue_alarm_notify_in_progress(ae);
+ freez(edit_command);
+ buffer_free(warn_alarms);
+ buffer_free(crit_alarms);
+
return; //health_alarm_wait_for_execution
done:
health_alarm_log_save(host, ae);
@@ -635,6 +665,8 @@ void *health_main(void *ptr) {
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
if(min_run_every < 1) min_run_every = 1;
+ int cleanup_sql_every_loop = 7200 / min_run_every;
+
time_t now = now_realtime_sec();
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
@@ -689,6 +721,9 @@ void *health_main(void *ptr) {
host->health_delay_up_to = 0;
}
+ if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
+ sql_health_alarm_log_cleanup(host);
+
rrdhost_rdlock(host);
// the first loop is to lookup values from the db
@@ -929,7 +964,7 @@ void *health_main(void *ptr) {
if(likely(!rrdcalc_isrepeating(rc))) {
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
rc->delay_last,
@@ -979,7 +1014,7 @@ void *health_main(void *ptr) {
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
rc->last_repeat = now;
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
rc->delay_last,
@@ -1003,6 +1038,14 @@ void *health_main(void *ptr) {
rrdhost_unlock(host);
}
+#ifdef ENABLE_ACLK
+#ifdef ENABLE_NEW_CLOUD_PROTOCOL
+ if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > 2) {
+ sql_queue_removed_alerts_to_aclk(host);
+ }
+#endif
+#endif
+
if (unlikely(netdata_exit))
break;
@@ -1027,8 +1070,12 @@ void *health_main(void *ptr) {
health_alarm_wait_for_execution(ae);
}
- rrd_unlock();
+#ifdef ENABLE_NEW_CLOUD_PROTOCOL
+ if (netdata_cloud_setting && unlikely(aclk_alert_reloaded))
+ aclk_alert_reloaded = 0;
+#endif
+ rrd_unlock();
if(unlikely(netdata_exit))
break;
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
index b067e1840..1d823addd 100644
--- a/health/health.d/adaptec_raid.conf
+++ b/health/health.d/adaptec_raid.conf
@@ -3,9 +3,9 @@
template: adaptec_raid_ld_status
on: adaptec_raid.ld_status
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: max -10s foreach *
units: bool
every: 10s
@@ -18,9 +18,9 @@ component: RAID
template: adaptec_raid_pd_state
on: adaptec_raid.pd_state
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: max -10s foreach *
units: bool
every: 10s
diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf
deleted file mode 100644
index 4bac98fbb..000000000
--- a/health/health.d/am2320.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# make sure am2320 is sending stats
-
- template: am2320_last_collected_secs
- on: am2320.temperature
- class: Other
-component: Sensors
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
index f27e39fc1..269ae544b 100644
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@@ -2,9 +2,9 @@
template: anomalies_anomaly_probabilities
on: anomalies.probability
- class: Netdata
+ class: Errors
+ type: Netdata
component: ML
- type: Errors
lookup: average -2m foreach *
every: 1m
warn: $this > 50
@@ -14,9 +14,9 @@ component: ML
template: anomalies_anomaly_flags
on: anomalies.anomaly
- class: Netdata
+ class: Errors
+ type: Netdata
component: ML
- type: Errors
lookup: sum -2m foreach *
every: 1m
warn: $this > 10
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 07b5c28c9..65f1a69ab 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -2,9 +2,9 @@
template: apcupsd_10min_ups_load
on: apcupsd.load
- class: Power Supply
+ class: Utilization
+ type: Power Supply
component: UPS
- type: Utilization
os: *
hosts: *
lookup: average -10m unaligned of percentage
@@ -20,9 +20,9 @@ component: UPS
# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
template: apcupsd_ups_charge
on: apcupsd.charge
- class: Power Supply
+ class: Errors
+ type: Power Supply
component: UPS
- type: Errors
os: *
hosts: *
lookup: average -60s unaligned of charge
@@ -36,9 +36,9 @@ component: UPS
template: apcupsd_last_collected_secs
on: apcupsd.load
- class: Power Supply
+ class: Latency
+ type: Power Supply
component: UPS device
- type: Latency
calc: $now - $last_collected_t
every: 10s
units: seconds ago
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index 948ea551a..91d469395 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -1,9 +1,9 @@
# Alert that backends subsystem will be disabled soon
alarm: backend_metrics_eol
on: netdata.backend_metrics
- class: Netdata
+ class: Errors
+ type: Netdata
component: Exporting engine
- type: Errors
units: boolean
calc: $now - $last_collected_t
every: 1m
@@ -16,9 +16,9 @@ component: Exporting engine
alarm: backend_last_buffering
on: netdata.backend_metrics
- class: Netdata
+ class: Latency
+ type: Netdata
component: Exporting engine
- type: Latency
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -30,9 +30,9 @@ component: Exporting engine
alarm: backend_metrics_sent
on: netdata.backend_metrics
- class: Netdata
+ class: Workload
+ type: Netdata
component: Exporting engine
- type: Workload
units: %
calc: abs($sent) * 100 / abs($buffered)
every: 10s
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index d75d8e19b..49cb5ad0f 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -1,9 +1,9 @@
template: bcache_cache_errors
on: disk.bcache_cache_read_races
- class: System
+ class: Errors
+ type: System
component: Disk
- type: Errors
lookup: sum -1m unaligned absolute
units: errors
every: 1m
@@ -16,9 +16,9 @@ component: Disk
template: bcache_cache_dirty
on: disk.bcache_cache_alloc
- class: System
+ class: Utilization
+ type: System
component: Disk
- type: Utilization
calc: $dirty + $metadata + $undefined
units: %
every: 1m
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 99c754571..13ac8c182 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -2,9 +2,9 @@
template: beanstalk_server_buried_jobs
on: beanstalk.current_jobs
- class: Messaging
+ class: Workload
+ type: Messaging
component: Beanstalk
- type: Workload
calc: $buried
units: jobs
every: 10s
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index e88f87a4f..7c09225ff 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -1,8 +1,8 @@
template: bind_rndc_stats_file_size
on: bind_rndc.stats_size
- class: DNS
+ class: Utilization
+ type: DNS
component: BIND
- type: Utilization
units: megabytes
every: 60
calc: $stats_size
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 8604abee9..7d7a4fdae 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -3,9 +3,9 @@
# Warn on any compute errors encountered.
template: boinc_compute_errors
on: boinc.states
- class: Computing
+ class: Errors
+ type: Computing
component: BOINC
- type: Errors
os: *
hosts: *
families: *
@@ -21,9 +21,9 @@ component: BOINC
# Warn on lots of upload errors
template: boinc_upload_errors
on: boinc.states
- class: Computing
+ class: Errors
+ type: Computing
component: BOINC
- type: Errors
os: *
hosts: *
families: *
@@ -39,9 +39,9 @@ component: BOINC
# Warn on the task queue being empty
template: boinc_total_tasks
on: boinc.tasks
- class: Computing
+ class: Utilization
+ type: Computing
component: BOINC
- type: Utilization
os: *
hosts: *
families: *
@@ -57,9 +57,9 @@ component: BOINC
# Warn on no active tasks with a non-empty queue
template: boinc_active_tasks
on: boinc.tasks
- class: Computing
+ class: Utilization
+ type: Computing
component: BOINC
- type: Utilization
os: *
hosts: *
families: *
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index d3200a7ee..8d197aa8d 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -1,9 +1,9 @@
template: btrfs_allocated
on: btrfs.disk
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
os: *
hosts: *
families: *
@@ -18,9 +18,9 @@ component: File system
template: btrfs_data
on: btrfs.data
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
os: *
hosts: *
families: *
@@ -35,9 +35,9 @@ component: File system
template: btrfs_metadata
on: btrfs.metadata
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
os: *
hosts: *
families: *
@@ -52,9 +52,9 @@ component: File system
template: btrfs_system
on: btrfs.system
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
os: *
hosts: *
families: *
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index ed8f9b4b9..1f9da25c7 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -2,9 +2,9 @@
template: ceph_cluster_space_usage
on: ceph.general_usage
- class: Storage
+ class: Utilization
+ type: Storage
component: Ceph
- type: Utilization
calc: $used * 100 / ($used + $avail)
units: %
every: 1m
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 068533f10..45b34806c 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -3,9 +3,9 @@
template: cgroup_10min_cpu_usage
on: cgroup.cpu_limit
- class: Cgroups
+ class: Utilization
+ type: Cgroups
component: CPU
- type: Utilization
os: linux
hosts: *
lookup: average -10m unaligned
@@ -19,9 +19,9 @@ component: CPU
template: cgroup_ram_in_use
on: cgroup.mem_usage
- class: Cgroups
+ class: Utilization
+ type: Cgroups
component: Memory
- type: Utilization
os: linux
hosts: *
calc: ($ram) * 100 / $memory_limit
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
index dccd2b064..1f227841e 100644
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -1,27 +1,11 @@
-# Availability
-
- template: cockroachdb_last_collected_secs
- on: cockroachdb.live_nodes
- class: Database
-component: CockroachDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
# Capacity
template: cockroachdb_used_storage_capacity
on: cockroachdb.storage_used_capacity_percentage
- class: Database
+ class: Utilization
+ type: Database
component: CockroachDB
- type: Utilization
calc: $capacity_used_percent
units: %
every: 10s
@@ -33,9 +17,9 @@ component: CockroachDB
template: cockroachdb_used_usable_storage_capacity
on: cockroachdb.storage_used_capacity_percentage
- class: Database
+ class: Utilization
+ type: Database
component: CockroachDB
- type: Utilization
calc: $capacity_usable_used_percent
units: %
every: 10s
@@ -49,37 +33,37 @@ component: CockroachDB
template: cockroachdb_unavailable_ranges
on: cockroachdb.ranges_replication_problem
- class: Database
+ class: Errors
+ type: Database
component: CockroachDB
- type: Utilization
calc: $ranges_unavailable
units: num
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: number of ranges with fewer live replicas than the replication target
+ info: number of ranges with fewer live replicas than needed for quorum
to: dba
- template: cockroachdb_replicas_leaders_not_leaseholders
- on: cockroachdb.replicas_leaders
- class: Database
+ template: cockroachdb_underreplicated_ranges
+ on: cockroachdb.ranges_replication_problem
+ class: Errors
+ type: Database
component: CockroachDB
- type: Utilization
- calc: $replicas_leaders_not_leaseholders
+ calc: $ranges_underreplicated
units: num
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: number of replicas that are Raft leaders whose range lease is held by another store
+ info: number of ranges with fewer live replicas than the replication target
to: dba
# FD
template: cockroachdb_open_file_descriptors_limit
on: cockroachdb.process_file_descriptors
- class: Database
+ class: Utilization
+ type: Database
component: CockroachDB
- type: Utilization
calc: $sys_fd_open/$sys_fd_softlimit * 100
units: %
every: 10s
@@ -87,29 +71,3 @@ component: CockroachDB
delay: down 15m multiplier 1.5 max 1h
info: open file descriptors utilization (against softlimit)
to: dba
-
-# SQL
-
- template: cockroachdb_sql_active_connections
- on: cockroachdb.sql_connections
- class: Database
-component: CockroachDB
- type: Utilization
- calc: $sql_conns
- units: active connections
- every: 10s
- info: number of active SQL connections
- to: dba
-
- template: cockroachdb_sql_executed_statements_total_last_5m
- on: cockroachdb.sql_statements_total
- class: Database
-component: CockroachDB
- type: Workload
- lookup: sum -5m absolute of sql_query_count
- units: statements
- every: 10s
- warn: $this == 0 AND $cockroachdb_sql_active_connections != 0
- delay: down 15m up 30s multiplier 1.5 max 1h
- info: number of executed SQL statements in the last 5 minutes
- to: dba
diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf
deleted file mode 100644
index c86c6b988..000000000
--- a/health/health.d/couchdb.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# make sure couchdb is running
-
- template: couchdb_last_collected_secs
- on: couchdb.request_methods
- class: Database
-component: CouchDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index d11215768..ad6952825 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -3,9 +3,9 @@
template: 10min_cpu_usage
on: system.cpu
- class: System
+ class: Utilization
+ type: System
component: CPU
- type: Utilization
os: linux
hosts: *
lookup: average -10m unaligned of user,system,softirq,irq,guest
@@ -19,9 +19,9 @@ component: CPU
template: 10min_cpu_iowait
on: system.cpu
- class: System
+ class: Utilization
+ type: System
component: CPU
- type: Utilization
os: linux
hosts: *
lookup: average -10m unaligned of iowait
@@ -35,9 +35,9 @@ component: CPU
template: 20min_steal_cpu
on: system.cpu
- class: System
+ class: Latency
+ type: System
component: CPU
- type: Latency
os: linux
hosts: *
lookup: average -20m unaligned of steal
@@ -52,9 +52,9 @@ component: CPU
## FreeBSD
template: 10min_cpu_usage
on: system.cpu
- class: System
+ class: Utilization
+ type: System
component: CPU
- type: Utilization
os: freebsd
hosts: *
lookup: average -10m unaligned of user,system,interrupt
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index 79c156ab8..65c41b846 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -3,9 +3,9 @@
alarm: 10min_dbengine_global_fs_errors
on: netdata.dbengine_global_errors
- class: Netdata
+ class: Errors
+ type: Netdata
component: DB engine
- type: Errors
os: linux freebsd macos
hosts: *
lookup: sum -10m unaligned of fs_errors
@@ -18,9 +18,9 @@ component: DB engine
alarm: 10min_dbengine_global_io_errors
on: netdata.dbengine_global_errors
- class: Netdata
+ class: Errors
+ type: Netdata
component: DB engine
- type: Errors
os: linux freebsd macos
hosts: *
lookup: sum -10m unaligned of io_errors
@@ -33,9 +33,9 @@ component: DB engine
alarm: 10min_dbengine_global_flushing_warnings
on: netdata.dbengine_global_errors
- class: Netdata
+ class: Errors
+ type: Netdata
component: DB engine
- type: Errors
os: linux freebsd macos
hosts: *
lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
@@ -49,9 +49,9 @@ component: DB engine
alarm: 10min_dbengine_global_flushing_errors
on: netdata.dbengine_long_term_page_stats
- class: Netdata
+ class: Errors
+ type: Netdata
component: DB engine
- type: Errors
os: linux freebsd macos
hosts: *
lookup: sum -10m unaligned of flushing_pressure_deletions
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 60f8faed9..5daff61a1 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -11,9 +11,9 @@
template: disk_space_usage
on: disk.space
- class: System
+ class: Utilization
+ type: System
component: Disk
- type: Utilization
os: linux freebsd
hosts: *
families: !/dev !/dev/* !/run !/run/* *
@@ -28,9 +28,9 @@ component: Disk
template: disk_inode_usage
on: disk.inodes
- class: System
+ class: Utilization
+ type: System
component: Disk
- type: Utilization
os: linux freebsd
hosts: *
families: !/dev !/dev/* !/run !/run/* *
@@ -136,19 +136,16 @@ component: Disk
template: 10min_disk_utilization
on: disk.util
- class: System
+ class: Utilization
+ type: System
component: Disk
- type: Utilization
os: linux freebsd
hosts: *
families: *
lookup: average -10m unaligned
units: %
every: 1m
- green: 90
- red: 98
- warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
- crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: average percentage of time $family disk was busy over the last 10 minutes
to: silent
@@ -161,19 +158,16 @@ component: Disk
template: 10min_disk_backlog
on: disk.backlog
- class: System
+ class: Latency
+ type: System
component: Disk
- type: Latency
os: linux
hosts: *
families: *
lookup: average -10m unaligned
units: ms
every: 1m
- green: 2000
- red: 5000
- warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
- crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: average backlog size of the $family disk over the last 10 minutes
to: silent
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index 1fbb2c598..ec4937c0a 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -3,9 +3,9 @@
template: dns_query_time_query_time
on: dns_query_time.query_time
- class: DNS
+ class: Latency
+ type: DNS
component: DNS
- type: Latency
lookup: average -10s unaligned foreach *
units: ms
every: 10s
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 10d139f77..010b94599 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -2,9 +2,9 @@
template: dnsmasq_dhcp_dhcp_range_utilization
on: dnsmasq_dhcp.dhcp_range_utilization
- class: DHCP
+ class: Utilization
+ type: DHCP
component: Dnsmasq
- type: Utilization
every: 10s
units: %
calc: $used
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
index ba866f81b..220ddd664 100644
--- a/health/health.d/dockerd.conf
+++ b/health/health.d/dockerd.conf
@@ -1,8 +1,8 @@
template: docker_unhealthy_containers
on: docker.unhealthy_containers
- class: Containers
+ class: Errors
+ type: Containers
component: Docker
- type: Errors
units: unhealthy containers
every: 10s
lookup: average -10s
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
deleted file mode 100644
index 05d576c39..000000000
--- a/health/health.d/elasticsearch.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-
-# make sure elasticsearch is running
-
- template: elasticsearch_last_collected
- on: elasticsearch.cluster_health_status
- class: Search engine
-component: Elasticsearch
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
index 0478fa0be..13b0fcde4 100644
--- a/health/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
@@ -5,9 +5,9 @@
alarm: lowest_entropy
on: system.entropy
- class: System
+ class: Utilization
+ type: System
component: Cryptography
- type: Utilization
os: linux
hosts: *
lookup: min -5m unaligned
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
index 4430f3fd8..06f398c6e 100644
--- a/health/health.d/exporting.conf
+++ b/health/health.d/exporting.conf
@@ -1,22 +1,25 @@
-template: exporting_last_buffering
-families: *
- on: exporting_data_size
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful buffering of exporting data
- to: dba
+ template: exporting_last_buffering
+ families: *
+ on: exporting_data_size
+ class: Latency
+ type: Netdata
+component: Exporting engine
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful buffering of exporting data
+ to: dba
template: exporting_metrics_sent
families: *
on: exporting_data_size
- class: Netdata
+ class: Workload
+ type: Netdata
component: Exporting engine
- type: Workload
units: %
calc: abs($sent) * 100 / abs($buffered)
every: 10s
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
index 120fe8f28..bb22419fa 100644
--- a/health/health.d/fping.conf
+++ b/health/health.d/fping.conf
@@ -2,9 +2,9 @@
template: fping_last_collected_secs
families: *
on: fping.latency
- class: Other
+ class: Latency
+ type: Other
component: Network
- type: Latency
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -17,9 +17,9 @@ component: Network
template: fping_host_reachable
families: *
on: fping.latency
- class: Other
+ class: Errors
+ type: Other
component: Network
- type: Errors
calc: $average != nan
units: up/down
every: 10s
@@ -31,9 +31,9 @@ component: Network
template: fping_host_latency
families: *
on: fping.latency
- class: Other
+ class: Latency
+ type: Other
component: Network
- type: Latency
lookup: average -10s unaligned of average
units: ms
every: 10s
@@ -48,9 +48,9 @@ component: Network
template: fping_packet_loss
families: *
on: fping.quality
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
lookup: average -10m unaligned of returned
calc: 100 - $this
green: 1
diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf
index 81aafaa60..853bd7fbc 100644
--- a/health/health.d/fronius.conf
+++ b/health/health.d/fronius.conf
@@ -1,9 +1,9 @@
template: fronius_last_collected_secs
families: *
on: fronius.power
- class: Power Supply
+ class: Latency
+ type: Power Supply
component: Solar
- type: Latency
calc: $now - $last_collected_t
every: 10s
units: seconds ago
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index e2031bf2b..14010d445 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -1,24 +1,10 @@
-# make sure Gearman is running
- template: gearman_last_collected_secs
- on: gearman.total_jobs
- class: Computing
-component: Gearman
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
template: gearman_workers_queued
on: gearman.single_job
- class: Computing
+ class: Latency
+ type: Computing
component: Gearman
- type: Latency
- lookup: average -10m unaligned match-names of Queued
+ lookup: average -10m unaligned match-names of Pending
units: workers
every: 10s
warn: $this > 30000
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
new file mode 100644
index 000000000..dd1eb4701
--- /dev/null
+++ b/health/health.d/geth.conf
@@ -0,0 +1,12 @@
+#chainhead_header is expected to be momentarily ahead. If it's considerably ahead (e.g. more than 5 blocks), then the node is definitely out of sync.
+ template: geth_chainhead_diff_between_header_block
+ on: geth.chainhead
+ class: Workload
+ type: ethereum_node
+component: geth
+ every: 10s
+ calc: $chain_head_block - $chain_head_header
+ units: blocks
+ warn: $this != 0
+ crit: $this > 5
+ delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/nginx_plus.conf b/health/health.d/go.d.plugin.conf
index 5849a9e7e..8bf84a976 100644
--- a/health/health.d/nginx_plus.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -1,11 +1,12 @@
-# make sure nginx_plus is running
+# make sure go.d.plugin data collection job is running
- template: nginx_plus_last_collected_secs
- on: nginx_plus.requests_total
- class: Web Server
-component: NGINX Plus
- type: Latency
+ template: go.d_job_last_collected_secs
+ on: netdata.go_plugin_execution_time
+ class: Errors
+ type: Netdata
+component: go.d.plugin
+ module: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -14,4 +15,3 @@ component: NGINX Plus
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
-
diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf
index 9f6b1c577..a0ab52bca 100644
--- a/health/health.d/haproxy.conf
+++ b/health/health.d/haproxy.conf
@@ -1,8 +1,8 @@
template: haproxy_backend_server_status
on: haproxy_hs.down
- class: Web Proxy
+ class: Errors
+ type: Web Proxy
component: HAProxy
- type: Errors
units: failed servers
every: 10s
lookup: average -10s
@@ -12,25 +12,12 @@ component: HAProxy
template: haproxy_backend_status
on: haproxy_hb.down
- class: Web Proxy
+ class: Errors
+ type: Web Proxy
component: HAProxy
- type: Errors
units: failed backend
every: 10s
lookup: average -10s
crit: $this > 0
info: average number of failed haproxy backends over the last 10 seconds
to: sysadmin
-
- template: haproxy_last_collected
- on: haproxy_hb.down
- class: Web Proxy
-component: HAProxy
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
index bd8308bed..ca8df31b9 100644
--- a/health/health.d/hdfs.conf
+++ b/health/health.d/hdfs.conf
@@ -1,28 +1,11 @@
-# make sure hdfs is running
-
- template: hdfs_last_collected_secs
- on: hdfs.heap_memory
- class: Storage
-component: HDFS
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
# Common
template: hdfs_capacity_usage
on: hdfs.capacity
- class: Storage
+ class: Utilization
+ type: Storage
component: HDFS
- type: Utilization
calc: ($used) * 100 / ($used + $remaining)
units: %
every: 10s
@@ -37,9 +20,9 @@ component: HDFS
template: hdfs_missing_blocks
on: hdfs.blocks
- class: Storage
+ class: Errors
+ type: Storage
component: HDFS
- type: Errors
calc: $missing
units: missing blocks
every: 10s
@@ -51,9 +34,9 @@ component: HDFS
template: hdfs_stale_nodes
on: hdfs.data_nodes
- class: Storage
+ class: Errors
+ type: Storage
component: HDFS
- type: Errors
calc: $stale
units: dead nodes
every: 10s
@@ -65,9 +48,9 @@ component: HDFS
template: hdfs_dead_nodes
on: hdfs.data_nodes
- class: Storage
+ class: Errors
+ type: Storage
component: HDFS
- type: Errors
calc: $dead
units: dead nodes
every: 10s
@@ -81,9 +64,9 @@ component: HDFS
template: hdfs_num_failed_volumes
on: hdfs.num_failed_volumes
- class: Storage
+ class: Errors
+ type: Storage
component: HDFS
- type: Errors
calc: $fsds_num_failed_volumes
units: failed volumes
every: 10s
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index d4d6376a3..599c47acc 100644
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -1,25 +1,11 @@
- template: httpcheck_last_collected_secs
- families: *
- on: httpcheck.status
- class: Other
-component: HTTP endpoint
- type: Latency
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
template: httpcheck_web_service_up
families: *
on: httpcheck.status
- class: Web Server
+ class: Utilization
+ type: Web Server
component: HTTP endpoint
- type: Utilization
lookup: average -1m unaligned percentage of success
calc: ($this < 75) ? (0) : ($this)
every: 5s
@@ -30,9 +16,9 @@ component: HTTP endpoint
template: httpcheck_web_service_bad_content
families: *
on: httpcheck.status
- class: Web Server
+ class: Workload
+ type: Web Server
component: HTTP endpoint
- type: Workload
lookup: average -5m unaligned percentage of bad_content
every: 10s
units: %
@@ -46,9 +32,9 @@ component: HTTP endpoint
template: httpcheck_web_service_bad_status
families: *
on: httpcheck.status
- class: Web Server
+ class: Workload
+ type: Web Server
component: HTTP endpoint
- type: Workload
lookup: average -5m unaligned percentage of bad_status
every: 10s
units: %
@@ -62,9 +48,9 @@ component: HTTP endpoint
template: httpcheck_web_service_timeouts
families: *
on: httpcheck.status
- class: Web Server
+ class: Latency
+ type: Web Server
component: HTTP endpoint
- type: Latency
lookup: average -5m unaligned percentage of timeout
every: 10s
units: %
@@ -73,9 +59,9 @@ component: HTTP endpoint
template: httpcheck_no_web_service_connections
families: *
on: httpcheck.status
- class: Other
+ class: Errors
+ type: Other
component: HTTP endpoint
- type: Errors
lookup: average -5m unaligned percentage of no_connection
every: 10s
units: %
@@ -85,9 +71,9 @@ component: HTTP endpoint
template: httpcheck_web_service_unreachable
families: *
on: httpcheck.status
- class: Web Server
+ class: Errors
+ type: Web Server
component: HTTP endpoint
- type: Errors
calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
units: %
every: 10s
@@ -101,9 +87,9 @@ component: HTTP endpoint
template: httpcheck_1h_web_service_response_time
families: *
on: httpcheck.responsetime
- class: Other
+ class: Latency
+ type: Other
component: HTTP endpoint
- type: Latency
lookup: average -1h unaligned of time
every: 30s
units: ms
@@ -112,9 +98,9 @@ component: HTTP endpoint
template: httpcheck_web_service_slow
families: *
on: httpcheck.responsetime
- class: Web Server
+ class: Latency
+ type: Web Server
component: HTTP endpoint
- type: Latency
lookup: average -3m unaligned of time
units: ms
every: 10s
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 57ce4e866..ee4befbea 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -1,9 +1,9 @@
template: ioping_disk_latency
families: *
on: ioping.latency
- class: System
+ class: Latency
+ type: System
component: Disk
- type: Latency
lookup: average -10s unaligned of average
units: ms
every: 10s
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index 6eaf7abe9..c178a410a 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -3,9 +3,9 @@
alarm: semaphores_used
on: system.ipc_semaphores
- class: System
+ class: Utilization
+ type: System
component: IPC
- type: Utilization
os: linux
hosts: *
calc: $semaphores * 100 / $ipc_semaphores_max
@@ -19,9 +19,9 @@ component: IPC
alarm: semaphore_arrays_used
on: system.ipc_semaphore_arrays
- class: System
+ class: Utilization
+ type: System
component: IPC
- type: Utilization
os: linux
hosts: *
calc: $arrays * 100 / $ipc_semaphores_arrays_max
diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf
index 6268f4092..a514ddfd0 100644
--- a/health/health.d/ipfs.conf
+++ b/health/health.d/ipfs.conf
@@ -1,9 +1,9 @@
template: ipfs_datastore_usage
on: ipfs.repo_size
- class: Data Sharing
+ class: Utilization
+ type: Data Sharing
component: IPFS
- type: Utilization
calc: $size * 100 / $avail
units: %
every: 10s
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index d4fdc6c79..feadba1b7 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -1,8 +1,8 @@
alarm: ipmi_sensors_states
on: ipmi.sensors_states
- class: System
+ class: Errors
+ type: System
component: IPMI
- type: Errors
calc: $warning + $critical
units: sensors
every: 10s
@@ -14,9 +14,9 @@ component: IPMI
alarm: ipmi_events
on: ipmi.events
- class: System
+ class: Utilization
+ type: System
component: IPMI
- type: Utilization
calc: $events
units: events
every: 10s
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
index 4d3c45f97..c2778cc5e 100644
--- a/health/health.d/kubelet.conf
+++ b/health/health.d/kubelet.conf
@@ -6,9 +6,9 @@
template: kubelet_node_config_error
on: k8s_kubelet.kubelet_node_config_error
- class: Kubernetes
+ class: Errors
+ type: Kubernetes
component: Kubelet
- type: Errors
calc: $kubelet_node_config_error
units: bool
every: 10s
@@ -22,9 +22,9 @@ component: Kubelet
template: kubelet_token_requests
lookup: sum -10s of token_fail_count
on: k8s_kubelet.kubelet_token_requests
- class: Kubernetes
+ class: Errors
+ type: Kubernetes
component: Kubelet
- type: Errors
units: failed requests
every: 10s
warn: $this > 0
@@ -37,9 +37,9 @@ component: Kubelet
template: kubelet_operations_error
lookup: sum -1m
on: k8s_kubelet.kubelet_operations_errors
- class: Kubernetes
+ class: Errors
+ type: Kubernetes
component: Kubelet
- type: Errors
units: errors
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (20))
@@ -64,9 +64,9 @@ component: Kubelet
template: kubelet_1m_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
units: microseconds
every: 10s
@@ -74,9 +74,9 @@ component: Kubelet
template: kubelet_10s_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
every: 10s
@@ -92,9 +92,9 @@ component: Kubelet
template: kubelet_1m_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
units: microseconds
every: 10s
@@ -102,9 +102,9 @@ component: Kubelet
template: kubelet_10s_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
every: 10s
@@ -120,9 +120,9 @@ component: Kubelet
template: kubelet_1m_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
units: microseconds
every: 10s
@@ -130,9 +130,9 @@ component: Kubelet
template: kubelet_10s_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
- class: Kubernetes
+ class: Latency
+ type: Kubernetes
component: Kubelet
- type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
every: 10s
diff --git a/health/health.d/lighttpd.conf b/health/health.d/lighttpd.conf
deleted file mode 100644
index 0f067549e..000000000
--- a/health/health.d/lighttpd.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure lighttpd is running
-
- template: lighttpd_last_collected_secs
- on: lighttpd.requests
- class: Web Server
-component: Lighttpd
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index e28c246a3..c0bc6de8a 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -2,9 +2,9 @@
template: linux_power_supply_capacity
on: powersupply.capacity
- class: Power Supply
+ class: Utilization
+ type: Power Supply
component: Battery
- type: Utilization
calc: $capacity
units: %
every: 10s
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index e811f6ee2..0bd872f85 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -6,9 +6,9 @@
# minute, with a special case for a single CPU of setting the trigger at 2.
alarm: load_cpu_number
on: system.load
- class: System
+ class: Utilization
+ type: System
component: Load
- type: Utilization
os: linux
hosts: *
calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
@@ -22,9 +22,9 @@ component: Load
alarm: load_average_15
on: system.load
- class: System
+ class: Utilization
+ type: System
component: Load
- type: Utilization
os: linux
hosts: *
lookup: max -1m unaligned of load15
@@ -37,9 +37,9 @@ component: Load
alarm: load_average_5
on: system.load
- class: System
+ class: Utilization
+ type: System
component: Load
- type: Utilization
os: linux
hosts: *
lookup: max -1m unaligned of load5
@@ -52,9 +52,9 @@ component: Load
alarm: load_average_1
on: system.load
- class: System
+ class: Utilization
+ type: System
component: Load
- type: Utilization
os: linux
hosts: *
lookup: max -1m unaligned of load1
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index 67483b201..cedaa000e 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -1,8 +1,8 @@
template: mdstat_last_collected
on: md.disks
- class: System
+ class: Latency
+ type: System
component: RAID
- type: Latency
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -13,9 +13,9 @@ component: RAID
template: mdstat_disks
on: md.disks
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
units: failed devices
every: 10s
calc: $down
@@ -26,9 +26,9 @@ component: RAID
template: mdstat_mismatch_cnt
on: md.mismatch_cnt
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
families: !*(raid1) !*(raid10) *
units: unsynchronized blocks
calc: $count
@@ -40,9 +40,9 @@ component: RAID
template: mdstat_nonredundant_last_collected
on: md.nonredundant
- class: System
+ class: Latency
+ type: System
component: RAID
- type: Latency
calc: $now - $last_collected_t
units: seconds ago
every: 10s
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 1b6502f62..9fbcfdb92 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -3,9 +3,9 @@
template: megacli_adapter_state
on: megacli.adapter_degraded
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: max -10s foreach *
units: boolean
every: 10s
@@ -18,9 +18,9 @@ component: RAID
template: megacli_pd_predictive_failures
on: megacli.pd_predictive_failure
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: sum -10s foreach *
units: predictive failures
every: 10s
@@ -31,9 +31,9 @@ component: RAID
template: megacli_pd_media_errors
on: megacli.pd_media_error
- class: System
+ class: Errors
+ type: System
component: RAID
- type: Errors
lookup: sum -10s foreach *
units: media errors
every: 10s
@@ -46,9 +46,9 @@ component: RAID
template: megacli_bbu_relative_charge
on: megacli.bbu_relative_charge
- class: System
+ class: Workload
+ type: System
component: RAID
- type: Workload
lookup: average -10s
units: percent
every: 10s
@@ -59,9 +59,9 @@ component: RAID
template: megacli_bbu_cycle_count
on: megacli.bbu_cycle_count
- class: System
+ class: Workload
+ type: System
component: RAID
- type: Workload
lookup: average -10s
units: cycles
every: 10s
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
index f4b734c38..2a2fe4b82 100644
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@@ -1,28 +1,11 @@
-# make sure memcached is running
-
- template: memcached_last_collected_secs
- on: memcached.cache
- class: KV Storage
-component: Memcached
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-
# detect if memcached cache is full
template: memcached_cache_memory_usage
on: memcached.cache
- class: KV Storage
+ class: Utilization
+ type: KV Storage
component: Memcached
- type: Utilization
calc: $used * 100 / ($used + $available)
units: %
every: 10s
@@ -37,9 +20,9 @@ component: Memcached
template: memcached_cache_fill_rate
on: memcached.cache
- class: KV Storage
+ class: Utilization
+ type: KV Storage
component: Memcached
- type: Utilization
lookup: min -10m at -50m unaligned of available
calc: ($this - $available) / (($now - $after) / 3600)
units: KB/hour
@@ -51,9 +34,9 @@ component: Memcached
template: memcached_out_of_cache_space_time
on: memcached.cache
- class: KV Storage
+ class: Utilization
+ type: KV Storage
component: Memcached
- type: Utilization
calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
units: hours
every: 10s
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
index ab651315f..010cbbd7b 100644
--- a/health/health.d/memory.conf
+++ b/health/health.d/memory.conf
@@ -3,9 +3,9 @@
alarm: 1hour_ecc_memory_correctable
on: mem.ecc_ce
- class: System
+ class: Errors
+ type: System
component: Memory
- type: Errors
os: linux
hosts: *
lookup: sum -10m unaligned
@@ -18,9 +18,9 @@ component: Memory
alarm: 1hour_ecc_memory_uncorrectable
on: mem.ecc_ue
- class: System
+ class: Errors
+ type: System
component: Memory
- type: Errors
os: linux
hosts: *
lookup: sum -10m unaligned
@@ -33,9 +33,9 @@ component: Memory
alarm: 1hour_memory_hw_corrupted
on: mem.hwcorrupt
- class: System
+ class: Errors
+ type: System
component: Memory
- type: Errors
os: linux
hosts: *
calc: $HardwareCorrupted
diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf
deleted file mode 100644
index 8c9bdeb6f..000000000
--- a/health/health.d/mongodb.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# make sure mongodb is running
-
- template: mongodb_last_collected_secs
- on: mongodb.read_operations
- class: Database
-component: MongoDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 91860c4a7..34452d983 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -1,29 +1,11 @@
-# make sure mysql is running
-
- template: mysql_last_collected_secs
- on: mysql.queries
- class: Database
-component: MySQL
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-
-# -----------------------------------------------------------------------------
# slow queries
template: mysql_10s_slow_queries
on: mysql.queries
- class: Database
+ class: Latency
+ type: Database
component: MySQL
- type: Latency
lookup: sum -10s of slow_queries
units: slow queries
every: 10s
@@ -39,9 +21,9 @@ component: MySQL
template: mysql_10s_table_locks_immediate
on: mysql.table_locks
- class: Database
+ class: Utilization
+ type: Database
component: MySQL
- type: Utilization
lookup: sum -10s absolute of immediate
units: immediate locks
every: 10s
@@ -50,9 +32,9 @@ component: MySQL
template: mysql_10s_table_locks_waited
on: mysql.table_locks
- class: Database
+ class: Latency
+ type: Database
component: MySQL
- type: Latency
lookup: sum -10s absolute of waited
units: waited locks
every: 10s
@@ -61,9 +43,9 @@ component: MySQL
template: mysql_10s_waited_locks_ratio
on: mysql.table_locks
- class: Database
+ class: Latency
+ type: Database
component: MySQL
- type: Latency
calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
units: %
every: 10s
@@ -79,9 +61,9 @@ component: MySQL
template: mysql_connections
on: mysql.connections_active
- class: Database
+ class: Utilization
+ type: Database
component: MySQL
- type: Utilization
calc: $active * 100 / $limit
units: %
every: 10s
@@ -97,9 +79,9 @@ component: MySQL
template: mysql_replication
on: mysql.slave_status
- class: Database
+ class: Errors
+ type: Database
component: MySQL
- type: Errors
calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
units: ok/failed
every: 10s
@@ -110,9 +92,9 @@ component: MySQL
template: mysql_replication_lag
on: mysql.slave_behind
- class: Database
+ class: Latency
+ type: Database
component: MySQL
- type: Errors
calc: $seconds
units: seconds
every: 10s
@@ -129,9 +111,9 @@ component: MySQL
template: mysql_galera_cluster_size_max_2m
on: mysql.galera_cluster_size
- class: Database
+ class: Utilization
+ type: Database
component: MySQL
- type: Utilization
lookup: max -2m absolute
units: nodes
every: 10s
@@ -140,9 +122,9 @@ component: MySQL
template: mysql_galera_cluster_size
on: mysql.galera_cluster_size
- class: Database
+ class: Utilization
+ type: Database
component: MySQL
- type: Utilization
calc: $nodes
units: nodes
every: 10s
@@ -156,9 +138,9 @@ component: MySQL
template: mysql_galera_cluster_state
on: mysql.galera_cluster_state
- class: Database
+ class: Errors
+ type: Database
component: MySQL
- type: Errors
calc: $state
every: 10s
warn: $this == 2 OR $this == 3
@@ -173,9 +155,9 @@ component: MySQL
template: mysql_galera_cluster_status
on: mysql.galera_cluster_status
- class: Database
+ class: Errors
+ type: Database
component: MySQL
- type: Errors
calc: $wsrep_cluster_status
every: 10s
crit: $mysql_galera_cluster_state != nan AND $this != 0
diff --git a/health/health.d/named.conf b/health/health.d/named.conf
deleted file mode 100644
index 90266df16..000000000
--- a/health/health.d/named.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure named is running
-
- template: named_last_collected_secs
- on: named.global_queries
- class: DNS
-component: BIND
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: domainadmin
-
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 04219e163..028ca7b81 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -6,9 +6,9 @@
template: interface_speed
on: net.net
- class: System
+ class: Latency
+ type: System
component: Network
- type: Latency
os: *
hosts: *
families: *
@@ -19,9 +19,9 @@ component: Network
template: 1m_received_traffic_overflow
on: net.net
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
families: *
@@ -36,9 +36,9 @@ component: Network
template: 1m_sent_traffic_overflow
on: net.net
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
families: *
@@ -63,9 +63,9 @@ component: Network
template: inbound_packets_dropped
on: net.drops
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: !net* *
@@ -76,9 +76,9 @@ component: Network
template: outbound_packets_dropped
on: net.drops
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: !net* *
@@ -89,14 +89,14 @@ component: Network
template: inbound_packets_dropped_ratio
on: net.packets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: !net* !wl* *
lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
warn: $this >= 2
@@ -106,9 +106,9 @@ component: Network
template: outbound_packets_dropped_ratio
on: net.packets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: !net* !wl* *
@@ -123,14 +123,14 @@ component: Network
template: wifi_inbound_packets_dropped_ratio
on: net.packets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: wl*
lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
warn: $this >= 10
@@ -140,9 +140,9 @@ component: Network
template: wifi_outbound_packets_dropped_ratio
on: net.packets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: wl*
@@ -160,9 +160,9 @@ component: Network
template: interface_inbound_errors
on: net.errors
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: freebsd
hosts: *
families: *
@@ -176,9 +176,9 @@ component: Network
template: interface_outbound_errors
on: net.errors
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: freebsd
hosts: *
families: *
@@ -200,9 +200,9 @@ component: Network
template: 10min_fifo_errors
on: net.fifo
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -225,9 +225,9 @@ component: Network
template: 1m_received_packets_rate
on: net.packets
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux freebsd
hosts: *
families: *
@@ -238,9 +238,9 @@ component: Network
template: 10s_received_packets_storm
on: net.packets
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux freebsd
hosts: *
families: *
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
index 35c89caf7..7de383fa2 100644
--- a/health/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -3,9 +3,9 @@
alarm: netfilter_conntrack_full
on: netfilter.conntrack_sockets
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: max -10s unaligned of connections
diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf
deleted file mode 100644
index 30c738f47..000000000
--- a/health/health.d/nginx.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure nginx is running
-
- template: nginx_last_collected_secs
- on: nginx.requests
- class: Web Server
-component: NGINX
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/phpfpm.conf b/health/health.d/phpfpm.conf
deleted file mode 100644
index fc073a944..000000000
--- a/health/health.d/phpfpm.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure phpfpm is running
-
- template: phpfpm_last_collected_secs
- on: phpfpm.requests
- class: Web Server
-component: PHP-FPM
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 72622caed..2e5c1cbfd 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,45 +1,12 @@
-# Make sure Pi-hole is responding.
-
- template: pihole_last_collected_secs
- on: pihole.dns_queries_total
- class: Ad Filtering
-component: Pi-hole
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-# Blocked DNS queries.
-
- template: pihole_blocked_queries
- on: pihole.dns_queries_percentage
- class: Ad Filtering
-component: Pi-hole
- type: Errors
- every: 10s
- units: %
- calc: $blocked
- warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
- crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
- delay: up 2m down 5m
- info: percentage of blocked dns queries over the last 24 hour
- to: sysadmin
-
-
# Blocklist last update time.
# Default update interval is a week.
template: pihole_blocklist_last_update
on: pihole.blocklist_last_update
- class: Ad Filtering
+ class: Errors
+ type: Ad Filtering
component: Pi-hole
- type: Errors
every: 10s
units: seconds
calc: $ago
@@ -52,15 +19,15 @@ component: Pi-hole
template: pihole_blocklist_gravity_file
on: pihole.blocklist_last_update
- class: Ad Filtering
+ class: Errors
+ type: Ad Filtering
component: Pi-hole
- type: Errors
every: 10s
units: boolean
calc: $file_exists
crit: $this != 1
delay: up 2m down 5m
- info: gravity.list (blocklist) file existence state (0: exists, 1: not-exists)
+ info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
to: sysadmin
# Pi-hole's ability to block unwanted domains.
@@ -68,13 +35,13 @@ component: Pi-hole
template: pihole_status
on: pihole.unwanted_domains_blocking_status
- class: Ad Filtering
+ class: Errors
+ type: Ad Filtering
component: Pi-hole
- type: Errors
every: 10s
units: boolean
calc: $enabled
warn: $this != 1
delay: up 2m down 5m
- info: unwanted domains blocking status (0: enabled, 1: disabled)
+ info: unwanted domains blocking status (0: disabled, 1: enabled)
to: sysadmin
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index b977dbb31..8cbd7729c 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -1,25 +1,11 @@
- template: portcheck_last_collected_secs
- families: *
- on: portcheck.status
- class: Other
-component: TCP endpoint
- type: Latency
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
template: portcheck_service_reachable
families: *
on: portcheck.status
- class: Other
+ class: Workload
+ type: Other
component: TCP endpoint
- type: Workload
lookup: average -1m unaligned percentage of success
calc: ($this < 75) ? (0) : ($this)
every: 5s
@@ -30,9 +16,9 @@ component: TCP endpoint
template: portcheck_connection_timeouts
families: *
on: portcheck.status
- class: Other
+ class: Errors
+ type: Other
component: TCP endpoint
- type: Errors
lookup: average -5m unaligned percentage of timeout
every: 10s
units: %
@@ -45,9 +31,9 @@ component: TCP endpoint
template: portcheck_connection_fails
families: *
on: portcheck.status
- class: Other
+ class: Errors
+ type: Other
component: TCP endpoint
- type: Errors
lookup: average -5m unaligned percentage of no_connection,failed
every: 10s
units: %
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
deleted file mode 100644
index f908a802a..000000000
--- a/health/health.d/postgres.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# make sure postgres is running
-
- template: postgres_last_collected_secs
- on: postgres.db_stat_transactions
- class: Database
-component: PostgreSQL
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
index b44a24c0b..2929ee3d4 100644
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@@ -2,9 +2,9 @@
alarm: active_processes
on: system.active_processes
- class: System
+ class: Workload
+ type: System
component: Processes
- type: Workload
hosts: *
calc: $active * 100 / $pidmax
units: %
diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf
deleted file mode 100644
index 9903d4e38..000000000
--- a/health/health.d/pulsar.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# Availability
-
- template: pulsar_last_collected_secs
- on: pulsar.broker_components
- class: Messaging
-component: Pulsar
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/health/health.d/apache.conf b/health/health.d/python.d.plugin.conf
index c623fb880..f3abc588f 100644
--- a/health/health.d/apache.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -1,11 +1,12 @@
-# make sure apache is running
+# make sure python.d.plugin data collection job is running
- template: apache_last_collected_secs
- on: apache.requests
- class: Web Server
-component: Apache
- type: Latency
+ template: python.d_job_last_collected_secs
+ on: netdata.pythond_runtime
+ class: Errors
+ type: Netdata
+component: python.d.plugin
+ module: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -14,4 +15,3 @@ component: Apache
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
-
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 0e3cc29fa..6e6e3b400 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -3,9 +3,9 @@
alarm: used_ram_to_ignore
on: system.ram
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: linux freebsd
hosts: *
calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
@@ -15,13 +15,12 @@ component: Memory
alarm: ram_in_use
on: system.ram
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: linux
hosts: *
-# calc: $used * 100 / ($used + $cached + $free)
- calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free + $buffers)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
@@ -32,12 +31,12 @@ component: Memory
alarm: ram_available
on: mem.available
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: linux
hosts: *
- calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
@@ -46,24 +45,25 @@ component: Memory
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
- alarm: oom_kill
- on: mem.oom_kill
- os: linux
- hosts: *
- lookup: sum -1m unaligned
- units: kills
- every: 10s
- warn: $this > 0
- delay: down 5m
- info: number of out of memory kills in the last minute
- to: sysadmin
+ alarm: oom_kill
+ on: mem.oom_kill
+ os: linux
+ hosts: *
+ lookup: sum -30m unaligned
+ units: kills
+ every: 5m
+ warn: $this > 0
+ delay: down 10m
+host labels: _is_k8s_node = false
+ info: number of out of memory kills in the last 30 minutes
+ to: sysadmin
## FreeBSD
alarm: ram_in_use
on: system.ram
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: freebsd
hosts: *
calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
@@ -77,9 +77,9 @@ component: Memory
alarm: ram_available
on: system.ram
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: freebsd
hosts: *
calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index e8b289942..dfb771e8c 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -1,26 +1,10 @@
-# make sure redis is running
-
- template: redis_last_collected_secs
- on: redis.operations
- class: KV Storage
-component: Redis
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
template: redis_bgsave_broken
families: *
on: redis.bgsave_health
- class: KV Storage
+ class: Errors
+ type: KV Storage
component: Redis
- type: Errors
every: 10s
crit: $rdb_last_bgsave_status != 0
units: ok/failed
@@ -31,9 +15,9 @@ component: Redis
template: redis_bgsave_slow
families: *
on: redis.bgsave_now
- class: KV Storage
+ class: Latency
+ type: KV Storage
component: Redis
- type: Latency
every: 10s
warn: $rdb_bgsave_in_progress > 600
crit: $rdb_bgsave_in_progress > 1200
diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf
index ca22e60de..14aa76b4c 100644
--- a/health/health.d/retroshare.conf
+++ b/health/health.d/retroshare.conf
@@ -1,26 +1,11 @@
-# make sure RetroShare is running
-
- template: retroshare_last_collected_secs
- on: retroshare.peers
- class: Data Sharing
-component: Retroshare
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
# make sure the DHT is fine when active
template: retroshare_dht_working
on: retroshare.dht
- class: Data Sharing
+ class: Utilization
+ type: Data Sharing
component: Retroshare
- type: Utilization
calc: $dht_size_all
units: peers
every: 1m
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
index b2c0e8d9c..261fd48c6 100644
--- a/health/health.d/riakkv.conf
+++ b/health/health.d/riakkv.conf
@@ -1,24 +1,10 @@
-# Ensure that Riak is running. template: riak_last_collected_secs
- template: riakkv_last_collected_secs
- on: riak.kv.throughput
- class: Database
-component: Riak KV
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
# Warn if a list keys operation is running.
template: riakkv_list_keys_active
on: riak.core.fsm_active
- class: Database
+ class: Utilization
+ type: Database
component: Riak KV
- type: Utilization
calc: $list_fsm_active
units: state machines
every: 10s
@@ -31,9 +17,9 @@ component: Riak KV
# KV GET
template: riakkv_1h_kv_get_mean_latency
on: riak.kv.latency.get
- class: Database
+ class: Latency
+ type: Database
component: Riak KV
- type: Latency
calc: $node_get_fsm_time_mean
lookup: average -1h unaligned of time
every: 30s
@@ -43,9 +29,9 @@ component: Riak KV
template: riakkv_kv_get_slow
on: riak.kv.latency.get
- class: Database
+ class: Latency
+ type: Database
component: Riak KV
- type: Latency
calc: $mean
lookup: average -3m unaligned of time
units: ms
@@ -61,9 +47,9 @@ component: Riak KV
# KV PUT
template: riakkv_1h_kv_put_mean_latency
on: riak.kv.latency.put
- class: Database
+ class: Latency
+ type: Database
component: Riak KV
- type: Latency
calc: $node_put_fsm_time_mean
lookup: average -1h unaligned of time
every: 30s
@@ -73,9 +59,9 @@ component: Riak KV
template: riakkv_kv_put_slow
on: riak.kv.latency.put
- class: Database
+ class: Latency
+ type: Database
component: Riak KV
- type: Latency
calc: $mean
lookup: average -3m unaligned of time
units: ms
@@ -95,9 +81,9 @@ component: Riak KV
# On systems observed, this is < 2000, but may grow depending on load.
template: riakkv_vm_high_process_count
on: riak.vm
- class: Database
+ class: Utilization
+ type: Database
component: Riak KV
- type: Utilization
calc: $sys_process_count
units: processes
every: 10s
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index 3c0dc1168..ab110bf07 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -1,27 +1,11 @@
-# make sure scaleio is running
-
- template: scaleio_last_collected_secs
- on: scaleio.system_capacity_total
- class: Storage
-component: ScaleIO
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
# make sure Storage Pool capacity utilization is under limit
template: scaleio_storage_pool_capacity_utilization
on: scaleio.storage_pool_capacity_utilization
- class: Storage
+ class: Utilization
+ type: Storage
component: ScaleIO
- type: Utilization
calc: $used
units: %
every: 10s
@@ -36,9 +20,9 @@ component: ScaleIO
template: scaleio_sdc_mdm_connection_state
on: scaleio.sdc_mdm_connection_state
- class: Storage
+ class: Utilization
+ type: Storage
component: ScaleIO
- type: Utilization
calc: $connected
every: 10s
warn: $this != 1
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index d8b01caff..345f87505 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -5,9 +5,9 @@
alarm: 1min_netdev_backlog_exceeded
on: system.softnet_stat
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -1m unaligned absolute of dropped
@@ -21,9 +21,9 @@ component: Network
alarm: 1min_netdev_budget_ran_outs
on: system.softnet_stat
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -1m unaligned absolute of squeezed
@@ -38,9 +38,9 @@ component: Network
alarm: 10min_netisr_backlog_exceeded
on: system.softnet_stat
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: freebsd
hosts: *
lookup: average -1m unaligned absolute of qdrops
diff --git a/health/health.d/squid.conf b/health/health.d/squid.conf
deleted file mode 100644
index 5c3d17629..000000000
--- a/health/health.d/squid.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure squid is running
-
- template: squid_last_collected_secs
- on: squid.clients_requests
- class: Web Proxy
-component: Squid
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: proxyadmin
-
diff --git a/health/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf
index f793b5ed1..493c8b73a 100644
--- a/health/health.d/stiebeleltron.conf
+++ b/health/health.d/stiebeleltron.conf
@@ -1,9 +1,9 @@
template: stiebeleltron_last_collected_secs
families: *
on: stiebeleltron.heating.hc1
- class: Other
+ class: Latency
+ type: Other
component: Sensors
- type: Latency
calc: $now - $last_collected_t
every: 10s
units: seconds ago
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
index 5b3f89a97..03c319320 100644
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@@ -3,9 +3,9 @@
alarm: 30min_ram_swapped_out
on: system.swapio
- class: System
+ class: Workload
+ type: System
component: Memory
- type: Workload
os: linux freebsd
hosts: *
lookup: sum -30m unaligned absolute of out
@@ -20,12 +20,12 @@ component: Memory
alarm: used_swap
on: system.swap
- class: System
+ class: Utilization
+ type: System
component: Memory
- type: Utilization
os: linux freebsd
hosts: *
- calc: $used * 100 / ( $used + $free )
+ calc: ($used + $free) > 0 ? ($used * 100 / ($used + $free)) : 0
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
index cc1a8698d..38213a8db 100644
--- a/health/health.d/systemdunits.conf
+++ b/health/health.d/systemdunits.conf
@@ -4,9 +4,9 @@
## Service units
template: systemd_service_units_state
on: systemd.service_units_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -18,9 +18,9 @@ component: Systemd units
## Socket units
template: systemd_socket_units_state
on: systemd.socket_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -32,9 +32,9 @@ component: Systemd units
## Target units
template: systemd_target_units_state
on: systemd.target_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -46,9 +46,9 @@ component: Systemd units
## Path units
template: systemd_path_units_state
on: systemd.path_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -60,9 +60,9 @@ component: Systemd units
## Device units
template: systemd_device_units_state
on: systemd.device_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -74,9 +74,9 @@ component: Systemd units
## Mount units
template: systemd_mount_units_state
on: systemd.mount_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -88,9 +88,9 @@ component: Systemd units
## Automount units
template: systemd_automount_units_state
on: systemd.automount_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -102,9 +102,9 @@ component: Systemd units
## Swap units
template: systemd_swap_units_state
on: systemd.swap_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -116,9 +116,9 @@ component: Systemd units
## Scope units
template: systemd_scope_units_state
on: systemd.scope_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
@@ -130,9 +130,9 @@ component: Systemd units
## Slice units
template: systemd_slice_units_state
on: systemd.slice_unit_state
- class: Linux
+ class: Errors
+ type: Linux
component: Systemd units
- type: Errors
lookup: max -1s min2max
units: ok/failed
every: 10s
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index f2c5e4e5d..67b3bee53 100644
--- a/health/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
@@ -7,9 +7,9 @@
alarm: tcp_connections
on: ipv4.tcpsock
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 51a0e461c..d4bcfa248 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -20,9 +20,9 @@
alarm: 1m_tcp_accept_queue_overflows
on: ip.tcp_accept_queue
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: average -60s unaligned absolute of ListenOverflows
@@ -38,9 +38,9 @@ component: Network
# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
alarm: 1m_tcp_accept_queue_drops
on: ip.tcp_accept_queue
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: average -60s unaligned absolute of ListenDrops
@@ -63,9 +63,9 @@ component: Network
alarm: 1m_tcp_syn_queue_drops
on: ip.tcp_syn_queue
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: average -60s unaligned absolute of TCPReqQFullDrop
@@ -80,9 +80,9 @@ component: Network
alarm: 1m_tcp_syn_queue_cookies
on: ip.tcp_syn_queue
- class: System
+ class: Workload
+ type: System
component: Network
- type: Workload
os: linux
hosts: *
lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 646e5c6da..318be20ac 100644
--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@@ -8,9 +8,9 @@
alarm: tcp_memory
on: ipv4.sockstat_tcp_mem
- class: System
+ class: Utilization
+ type: System
component: Network
- type: Utilization
os: linux
hosts: *
calc: ${mem} * 100 / ${tcp_mem_high}
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 6e94d67d1..cbd628da5 100644
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@@ -9,9 +9,9 @@
alarm: tcp_orphans
on: ipv4.sockstat_tcp_sockets
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
calc: ${orphan} * 100 / ${tcp_max_orphans}
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 41355dad6..190271e47 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -6,9 +6,9 @@
alarm: 1m_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -1m at -10s unaligned absolute of OutRsts
@@ -18,9 +18,9 @@ component: Network
alarm: 10s_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -10s unaligned absolute of OutRsts
@@ -40,9 +40,9 @@ component: Network
alarm: 1m_ipv4_tcp_resets_received
on: ipv4.tcphandshake
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux freebsd
hosts: *
lookup: average -1m at -10s unaligned absolute of AttemptFails
@@ -52,9 +52,9 @@ component: Network
alarm: 10s_ipv4_tcp_resets_received
on: ipv4.tcphandshake
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux freebsd
hosts: *
lookup: average -10s unaligned absolute of AttemptFails
diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf
new file mode 100644
index 000000000..ea90c4000
--- /dev/null
+++ b/health/health.d/timex.conf
@@ -0,0 +1,17 @@
+
+# It can take several minutes before ntpd selects a server to synchronize with;
+# try checking after 17 minutes (1024 seconds).
+
+ alarm: system_clock_sync_state
+ on: system.clock_sync_state
+ os: linux
+ class: Errors
+ type: System
+component: Clock
+ calc: $state
+ units: synchronization state
+ every: 10s
+ warn: $system.uptime.uptime > 17 * 60 AND $this == 0
+ delay: down 5m
+ info: the system time is not synchronized to a reliable server
+ to: silent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 342a1aedd..64f47dfa7 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -6,9 +6,9 @@
alarm: 1m_ipv4_udp_receive_buffer_errors
on: ipv4.udperrors
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux freebsd
hosts: *
lookup: average -1m unaligned absolute of RcvbufErrors
@@ -24,9 +24,9 @@ component: Network
alarm: 1m_ipv4_udp_send_buffer_errors
on: ipv4.udperrors
- class: System
+ class: Errors
+ type: System
component: Network
- type: Errors
os: linux
hosts: *
lookup: average -1m unaligned absolute of SndbufErrors
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
index 1df15474f..4e8d164d2 100644
--- a/health/health.d/unbound.conf
+++ b/health/health.d/unbound.conf
@@ -1,27 +1,11 @@
-# make sure unbound is running
-
- template: unbound_last_collected_secs
- on: unbound.queries
- class: DNS
-component: Unbound
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
# make sure there is no overwritten/dropped queries in the request-list
template: unbound_request_list_overwritten
on: unbound.request_list_jostle_list
- class: DNS
+ class: Errors
+ type: DNS
component: Unbound
- type: Errors
lookup: average -60s unaligned absolute match-names of overwritten
units: queries
every: 10s
@@ -32,9 +16,9 @@ component: Unbound
template: unbound_request_list_dropped
on: unbound.request_list_jostle_list
- class: DNS
+ class: Errors
+ type: DNS
component: Unbound
- type: Errors
lookup: average -60s unaligned absolute match-names of dropped
units: queries
every: 10s
diff --git a/health/health.d/varnish.conf b/health/health.d/varnish.conf
deleted file mode 100644
index 7f3bd6c82..000000000
--- a/health/health.d/varnish.conf
+++ /dev/null
@@ -1,12 +0,0 @@
- alarm: varnish_last_collected
- on: varnish.uptime
- class: Web Proxy
-component: Varnish
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index 8538e488c..a9cc7ceef 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -1,20 +1,4 @@
-# make sure vcsa is running and responding
-
- template: vcsa_last_collected_secs
- on: vcsa.system_health
- class: Virtual Machine
-component: VMware vCenter
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
# Overall system health:
# - 0: all components are healthy.
# - 1: one or more components might become overloaded soon.
@@ -24,9 +8,9 @@ component: VMware vCenter
template: vcsa_system_health
on: vcsa.system_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of system
units: status
every: 10s
@@ -46,9 +30,9 @@ component: VMware vCenter
template: vcsa_swap_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of swap
units: status
every: 10s
@@ -61,9 +45,9 @@ component: VMware vCenter
template: vcsa_storage_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of storage
units: status
every: 10s
@@ -76,9 +60,9 @@ component: VMware vCenter
template: vcsa_mem_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of mem
units: status
every: 10s
@@ -91,9 +75,9 @@ component: VMware vCenter
template: vcsa_load_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Utilization
+ type: Virtual Machine
component: VMware vCenter
- type: Utilization
lookup: max -10s unaligned of load
units: status
every: 10s
@@ -106,9 +90,9 @@ component: VMware vCenter
template: vcsa_database_storage_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of database_storage
units: status
every: 10s
@@ -121,9 +105,9 @@ component: VMware vCenter
template: vcsa_applmgmt_health
on: vcsa.components_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of applmgmt
units: status
every: 10s
@@ -143,9 +127,9 @@ component: VMware vCenter
template: vcsa_software_updates_health
on: vcsa.software_updates_health
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: VMware vCenter
- type: Errors
lookup: max -10s unaligned of software_packages
units: status
every: 10s
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
index 737147f38..cfbe2a524 100644
--- a/health/health.d/vernemq.conf
+++ b/health/health.d/vernemq.conf
@@ -1,27 +1,11 @@
-# Availability
-
- template: vernemq_last_collected_secs
- on: vernemq.node_uptime
- class: Messaging
-component: VerneMQ
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
# Socket errors
template: vernemq_socket_errors
on: vernemq.socket_errors
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: sum -1m unaligned absolute of socket_error
units: errors
every: 1m
@@ -34,9 +18,9 @@ component: VerneMQ
template: vernemq_queue_message_drop
on: vernemq.queue_undelivered_messages
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute of queue_message_drop
units: dropped messages
every: 1m
@@ -47,9 +31,9 @@ component: VerneMQ
template: vernemq_queue_message_expired
on: vernemq.queue_undelivered_messages
- class: Messaging
+ class: Latency
+ type: Messaging
component: VerneMQ
- type: Latency
lookup: average -1m unaligned absolute of queue_message_expired
units: expired messages
every: 1m
@@ -60,9 +44,9 @@ component: VerneMQ
template: vernemq_queue_message_unhandled
on: vernemq.queue_undelivered_messages
- class: Messaging
+ class: Latency
+ type: Messaging
component: VerneMQ
- type: Latency
lookup: average -1m unaligned absolute of queue_message_unhandled
units: unhandled messages
every: 1m
@@ -75,9 +59,9 @@ component: VerneMQ
template: vernemq_average_scheduler_utilization
on: vernemq.average_scheduler_utilization
- class: Messaging
+ class: Utilization
+ type: Messaging
component: VerneMQ
- type: Utilization
lookup: average -10m unaligned
units: %
every: 1m
@@ -91,9 +75,9 @@ component: VerneMQ
template: vernemq_cluster_dropped
on: vernemq.cluster_dropped
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: sum -1m unaligned
units: KiB
every: 1m
@@ -104,9 +88,9 @@ component: VerneMQ
template: vernemq_netsplits
on: vernemq.netsplits
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: sum -1m unaligned absolute of netsplit_detected
units: netsplits
every: 10s
@@ -119,9 +103,9 @@ component: VerneMQ
template: vernemq_mqtt_connack_sent_reason_unsuccessful
on: vernemq.mqtt_connack_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -134,9 +118,9 @@ component: VerneMQ
template: vernemq_mqtt_disconnect_received_reason_not_normal
on: vernemq.mqtt_disconnect_received_reason
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
units: packets
every: 1m
@@ -147,9 +131,9 @@ component: VerneMQ
template: vernemq_mqtt_disconnect_sent_reason_not_normal
on: vernemq.mqtt_disconnect_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
units: packets
every: 1m
@@ -162,9 +146,9 @@ component: VerneMQ
template: vernemq_mqtt_subscribe_error
on: vernemq.mqtt_subscribe_error
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute
units: failed ops
every: 1m
@@ -175,9 +159,9 @@ component: VerneMQ
template: vernemq_mqtt_subscribe_auth_error
on: vernemq.mqtt_subscribe_auth_error
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: attempts
every: 1m
@@ -190,9 +174,9 @@ component: VerneMQ
template: vernemq_mqtt_unsubscribe_error
on: vernemq.mqtt_unsubscribe_error
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute
units: failed ops
every: 1m
@@ -205,9 +189,9 @@ component: VerneMQ
template: vernemq_mqtt_publish_errors
on: vernemq.mqtt_publish_errors
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute
units: failed ops
every: 1m
@@ -218,9 +202,9 @@ component: VerneMQ
template: vernemq_mqtt_publish_auth_errors
on: vernemq.mqtt_publish_auth_errors
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: attempts
every: 1m
@@ -233,9 +217,9 @@ component: VerneMQ
template: vernemq_mqtt_puback_received_reason_unsuccessful
on: vernemq.mqtt_puback_received_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -246,9 +230,9 @@ component: VerneMQ
template: vernemq_mqtt_puback_sent_reason_unsuccessful
on: vernemq.mqtt_puback_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -259,9 +243,9 @@ component: VerneMQ
template: vernemq_mqtt_puback_unexpected
on: vernemq.mqtt_puback_invalid_error
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: messages
every: 1m
@@ -274,9 +258,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrec_received_reason_unsuccessful
on: vernemq.mqtt_pubrec_received_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -287,9 +271,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
on: vernemq.mqtt_pubrec_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -300,9 +284,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrec_invalid_error
on: vernemq.mqtt_pubrec_invalid_error
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: messages
every: 1m
@@ -315,9 +299,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrel_received_reason_unsuccessful
on: vernemq.mqtt_pubrel_received_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -328,9 +312,9 @@ component: VerneMQ
template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
on: vernemq.mqtt_pubrel_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -343,9 +327,9 @@ component: VerneMQ
template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
on: vernemq.mqtt_pubcomp_received_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -356,9 +340,9 @@ component: VerneMQ
template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
on: vernemq.mqtt_pubcomp_sent_reason
- class: Messaging
+ class: Errors
+ type: Messaging
component: VerneMQ
- type: Errors
lookup: average -1m unaligned absolute match-names of !success,*
units: packets
every: 1m
@@ -369,9 +353,9 @@ component: VerneMQ
template: vernemq_mqtt_pubcomp_unexpected
on: vernemq.mqtt_pubcomp_invalid_error
- class: Messaging
+ class: Workload
+ type: Messaging
component: VerneMQ
- type: Workload
lookup: average -1m unaligned absolute
units: messages
every: 1m
diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf
index aee7c5cd4..d8fc899b9 100644
--- a/health/health.d/vsphere.conf
+++ b/health/health.d/vsphere.conf
@@ -6,9 +6,9 @@
template: vsphere_vm_mem_usage
on: vsphere.vm_mem_usage_percentage
- class: Virtual Machine
+ class: Utilization
+ type: Virtual Machine
component: Memory
- type: Utilization
hosts: *
calc: $used
units: %
@@ -23,9 +23,9 @@ component: Memory
template: vsphere_host_mem_usage
on: vsphere.host_mem_usage_percentage
- class: Virtual Machine
+ class: Utilization
+ type: Virtual Machine
component: Memory
- type: Utilization
hosts: *
calc: $used
units: %
@@ -39,9 +39,9 @@ component: Memory
template: vsphere_inbound_packets_errors
on: vsphere.net_errors_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
@@ -51,9 +51,9 @@ component: Network
template: vsphere_outbound_packets_errors
on: vsphere.net_errors_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
@@ -65,9 +65,9 @@ component: Network
template: vsphere_inbound_packets_errors_ratio
on: vsphere.net_packets_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
@@ -81,9 +81,9 @@ component: Network
template: vsphere_outbound_packets_errors_ratio
on: vsphere.net_packets_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
@@ -100,9 +100,9 @@ component: Network
template: vsphere_cpu_usage
on: vsphere.cpu_usage_total
- class: Virtual Machine
+ class: Utilization
+ type: Virtual Machine
component: CPU
- type: Utilization
hosts: *
lookup: average -10m unaligned match-names of used
units: %
@@ -117,9 +117,9 @@ component: CPU
template: vsphere_inbound_packets_dropped
on: vsphere.net_drops_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
@@ -129,9 +129,9 @@ component: Network
template: vsphere_outbound_packets_dropped
on: vsphere.net_drops_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
@@ -143,9 +143,9 @@ component: Network
template: vsphere_inbound_packets_dropped_ratio
on: vsphere.net_packets_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
@@ -159,9 +159,9 @@ component: Network
template: vsphere_outbound_packets_dropped_ratio
on: vsphere.net_packets_total
- class: Virtual Machine
+ class: Errors
+ type: Virtual Machine
component: Network
- type: Errors
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 127c9a9c6..454e0abef 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -1,22 +1,4 @@
-# make sure we can collect web log data
-
- template: last_collected_secs
- on: web_log.response_codes
- class: Web Server
-component: Web log
- type: Latency
- families: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
# -----------------------------------------------------------------------------
# high level response code alarms
@@ -29,9 +11,9 @@ component: Web log
template: 1m_requests
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
@@ -41,9 +23,9 @@ component: Web log
template: 1m_successful
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned of successful_requests
calc: $this * 100 / $1m_requests
@@ -57,41 +39,39 @@ component: Web log
template: 1m_redirects
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned of redirects
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
to: webmaster
template: 1m_bad_requests
on: web_log.response_statuses
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of bad_requests
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of client error HTTP requests over the last minute (4xx except 401)
to: webmaster
template: 1m_internal_errors
on: web_log.response_statuses
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of server_errors
calc: $this * 100 / $1m_requests
@@ -114,9 +94,9 @@ component: Web log
template: 1m_total_requests
on: web_log.response_codes
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
@@ -126,9 +106,9 @@ component: Web log
template: 1m_unmatched
on: web_log.response_codes
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of unmatched
calc: $this * 100 / $1m_total_requests
@@ -151,9 +131,9 @@ component: Web log
template: 10m_response_time
on: web_log.response_time
- class: System
+ class: Latency
+ type: System
component: Web log
- type: Latency
families: *
lookup: average -10m unaligned of avg
units: ms
@@ -162,9 +142,9 @@ component: Web log
template: web_slow
on: web_log.response_time
- class: Web Server
+ class: Latency
+ type: Web Server
component: Web log
- type: Latency
families: *
lookup: average -1m unaligned of avg
units: ms
@@ -191,9 +171,9 @@ component: Web log
template: 5m_successful_old
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: average -5m at -5m unaligned of successful_requests
units: requests/s
@@ -202,9 +182,9 @@ component: Web log
template: 5m_successful
on: web_log.response_statuses
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: average -5m unaligned of successful_requests
units: requests/s
@@ -213,9 +193,9 @@ component: Web log
template: 5m_requests_ratio
on: web_log.response_codes
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
units: %
@@ -233,23 +213,6 @@ component: Web log
# ---------------------------------------------------GO-VERSION---------------------------------------------------------
-# make sure we can collect web log data
-
- template: web_log_last_collected_secs
- on: web_log.requests
- class: Web Server
-component: Web log
- type: Latency
- families: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
# unmatched lines
# the following alarms trigger only when there are enough data.
@@ -261,9 +224,9 @@ component: Web log
template: web_log_1m_total_requests
on: web_log.requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
@@ -273,9 +236,9 @@ component: Web log
template: web_log_1m_unmatched
on: web_log.excluded_requests
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of unmatched
calc: $this * 100 / $web_log_1m_total_requests
@@ -298,9 +261,9 @@ component: Web log
template: web_log_1m_requests
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
@@ -310,9 +273,9 @@ component: Web log
template: web_log_1m_successful
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned of success
calc: $this * 100 / $web_log_1m_requests
@@ -326,41 +289,39 @@ component: Web log
template: web_log_1m_redirects
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: sum -1m unaligned of redirect
calc: $this * 100 / $web_log_1m_requests
units: %
every: 10s
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
to: webmaster
template: web_log_1m_bad_requests
on: web_log.type_requests
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of bad
calc: $this * 100 / $web_log_1m_requests
units: %
every: 10s
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of client error HTTP requests over the last minute (4xx except 401)
to: webmaster
template: web_log_1m_internal_errors
on: web_log.type_requests
- class: Web Server
+ class: Errors
+ type: Web Server
component: Web log
- type: Errors
families: *
lookup: sum -1m unaligned of error
calc: $this * 100 / $web_log_1m_requests
@@ -384,9 +345,9 @@ component: Web log
template: web_log_10m_response_time
on: web_log.request_processing_time
- class: System
+ class: Latency
+ type: System
component: Web log
- type: Latency
families: *
lookup: average -10m unaligned of avg
units: ms
@@ -395,9 +356,9 @@ component: Web log
template: web_log_web_slow
on: web_log.request_processing_time
- class: Web Server
+ class: Latency
+ type: Web Server
component: Web log
- type: Latency
families: *
lookup: average -1m unaligned of avg
units: ms
@@ -424,9 +385,9 @@ component: Web log
template: web_log_5m_successful_old
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: average -5m at -5m unaligned of success
units: requests/s
@@ -435,9 +396,9 @@ component: Web log
template: web_log_5m_successful
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
lookup: average -5m unaligned of success
units: requests/s
@@ -446,9 +407,9 @@ component: Web log
template: web_log_5m_requests_ratio
on: web_log.type_requests
- class: Web Server
+ class: Workload
+ type: Web Server
component: Web log
- type: Workload
families: *
calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
units: %
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
index c6d3a9de0..be5eb58f9 100644
--- a/health/health.d/whoisquery.conf
+++ b/health/health.d/whoisquery.conf
@@ -1,26 +1,9 @@
-# make sure whoisquery is running
-
- template: whoisquery_last_collected_secs
- on: whoisquery.time_until_expiration
- class: Other
-component: WHOIS
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 60s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
template: whoisquery_days_until_expiration
on: whoisquery.time_until_expiration
- class: Other
+ class: Utilization
+ type: Other
component: WHOIS
- type: Utilization
calc: $expiry
units: seconds
every: 60s
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
index 6bd4e077f..90d39ce9d 100644
--- a/health/health.d/wmi.conf
+++ b/health/health.d/wmi.conf
@@ -1,29 +1,11 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-## Availability
-
- template: wmi_last_collected_secs
- on: cpu.collector_duration
- class: Windows
-component: Availability
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
## CPU
template: wmi_10min_cpu_usage
on: wmi.cpu_utilization_total
- class: Windows
+ class: Utilization
+ type: Windows
component: CPU
- type: Utilization
os: linux
hosts: *
lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
@@ -40,9 +22,9 @@ component: CPU
template: wmi_ram_in_use
on: wmi.memory_utilization
- class: Windows
+ class: Utilization
+ type: Windows
component: Memory
- type: Utilization
os: linux
hosts: *
calc: ($used) * 100 / ($used + $available)
@@ -56,9 +38,9 @@ component: Memory
template: wmi_swap_in_use
on: wmi.memory_swap_utilization
- class: Windows
+ class: Utilization
+ type: Windows
component: Memory
- type: Utilization
os: linux
hosts: *
calc: ($used) * 100 / ($used + $available)
@@ -75,9 +57,9 @@ component: Memory
template: wmi_inbound_packets_discarded
on: wmi.net_discarded
- class: Windows
+ class: Errors
+ type: Windows
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -91,9 +73,9 @@ component: Network
template: wmi_outbound_packets_discarded
on: wmi.net_discarded
- class: Windows
+ class: Errors
+ type: Windows
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -107,9 +89,9 @@ component: Network
template: wmi_inbound_packets_errors
on: wmi.net_errors
- class: Windows
+ class: Errors
+ type: Windows
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -123,9 +105,9 @@ component: Network
template: wmi_outbound_packets_errors
on: wmi.net_errors
- class: Windows
+ class: Errors
+ type: Windows
component: Network
- type: Errors
os: linux
hosts: *
families: *
@@ -142,9 +124,9 @@ component: Network
template: wmi_disk_in_use
on: wmi.logical_disk_utilization
- class: Windows
+ class: Utilization
+ type: Windows
component: Disk
- type: Utilization
os: linux
hosts: *
calc: ($used) * 100 / ($used + $free)
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index 93c406b7a..fc69d0288 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -1,26 +1,9 @@
-# make sure x509check is running
-
- template: x509check_last_collected_secs
- on: x509check.time_until_expiration
- class: Certificates
-component: x509 certificates
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 60s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
template: x509check_days_until_expiration
on: x509check.time_until_expiration
- class: Certificates
+ class: Latency
+ type: Certificates
component: x509 certificates
- type: Latency
calc: $expiry
units: seconds
every: 60s
@@ -31,9 +14,9 @@ component: x509 certificates
template: x509check_revocation_status
on: x509check.revocation_status
- class: Certificates
+ class: Errors
+ type: Certificates
component: x509 certificates
- type: Errors
calc: $revoked
every: 60s
crit: $this != nan AND $this != 0
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
index d6f5fa2fe..785838d47 100644
--- a/health/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
@@ -1,9 +1,9 @@
alarm: zfs_memory_throttle
on: zfs.memory_ops
- class: System
+ class: Utilization
+ type: System
component: File system
- type: Utilization
lookup: sum -10m unaligned absolute of throttled
units: events
every: 1m
@@ -16,9 +16,9 @@ component: File system
template: zfs_pool_state_warn
on: zfspool.state
- class: System
+ class: Errors
+ type: System
component: File system
- type: Errors
calc: $degraded
units: boolean
every: 10s
@@ -29,9 +29,9 @@ component: File system
template: zfs_pool_state_crit
on: zfspool.state
- class: System
+ class: Errors
+ type: System
component: File system
- type: Errors
calc: $faulted + $unavail
units: boolean
every: 10s
diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf
deleted file mode 100644
index 8c7d5a73d..000000000
--- a/health/health.d/zookeeper.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure zookeeper is running
-
- template: zookeeper_last_collected_secs
- on: zookeeper.requests
- class: KV Storage
-component: ZooKeeper
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.h b/health/health.h
index 56331b227..09040b3a8 100644
--- a/health/health.h
+++ b/health/health.h
@@ -3,7 +3,7 @@
#ifndef NETDATA_HEALTH_H
#define NETDATA_HEALTH_H 1
-#include "../daemon/common.h"
+#include "daemon/common.h"
#define NETDATA_PLUGIN_HOOK_HEALTH \
{ \
@@ -27,6 +27,7 @@ extern unsigned int default_health_enabled;
#define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040
#define HEALTH_ENTRY_FLAG_SAVED 0x10000000
+#define HEALTH_ENTRY_FLAG_ACLK_QUEUED 0x20000000
#define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
#ifndef HEALTH_LISTEN_PORT
@@ -63,6 +64,7 @@ extern ALARM_ENTRY* health_create_alarm_entry(
RRDHOST *host,
uint32_t alarm_id,
uint32_t alarm_event_id,
+ uuid_t config_hash_id,
time_t when,
const char *name,
const char *chart,
@@ -96,6 +98,8 @@ extern void *health_cmdapi_thread(void *ptr);
extern void health_label_log_save(RRDHOST *host);
+extern char *health_edit_command_from_source(const char *source);
+
extern SIMPLE_PATTERN *health_pattern_from_foreach(char *s);
#endif //NETDATA_HEALTH_H
diff --git a/health/health_config.c b/health/health_config.c
index 756023715..35234df15 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -473,6 +473,29 @@ static inline char *health_source_file(size_t line, const char *file) {
return strdupz(buffer);
}
+char *health_edit_command_from_source(const char *source) // Builds a "sudo <user config dir>/edit-config health.d/<file>=<prefix>" command string from an alarm source string and returns a strdupz() copy of it (the health_json.c caller releases it with freez()).
+{
+ char buffer[FILENAME_MAX + 1];
+ char *temp = strdupz(source); // private copy, because it is truncated in place below
+ char *line_num = strchr(temp, '@'); // first '@'; NOTE(review): source presumably looks like "<line>@<path>" as built by health_source_file() - confirm
+ char *file_no_path = strrchr(temp, '/'); // last '/', so file_no_path + 1 is the text after the final path separator
+
+ if (likely(file_no_path && line_num)) {
+ *line_num = '\0'; // cut at '@': temp now holds only the text that preceded it
+ snprintfz(
+ buffer,
+ FILENAME_MAX,
+ "sudo %s/edit-config health.d/%s=%s",
+ netdata_configured_user_config_dir,
+ file_no_path + 1,
+ temp);
+ } else
+ buffer[0] = '\0'; // no '@' or no '/' in source: the result is an empty string
+
+ freez(temp);
+ return strdupz(buffer);
+}
+
static inline void strip_quotes(char *s) {
while(*s) {
if(*s == '\'' || *s == '"') *s = ' ';
@@ -480,6 +503,40 @@ static inline void strip_quotes(char *s) {
}
}
+static inline void alert_config_free(struct alert_config *cfg) // Releases every string member of *cfg with freez() and then cfg itself.
+{
+ freez(cfg->alarm); // NOTE(review): cfg is dereferenced without a NULL check - callers must pass a valid pointer
+ freez(cfg->template_key);
+ freez(cfg->os);
+ freez(cfg->host);
+ freez(cfg->on);
+ freez(cfg->families);
+ freez(cfg->plugin);
+ freez(cfg->module);
+ freez(cfg->charts);
+ freez(cfg->lookup);
+ freez(cfg->calc);
+ freez(cfg->warn);
+ freez(cfg->crit);
+ freez(cfg->every);
+ freez(cfg->green);
+ freez(cfg->red);
+ freez(cfg->exec);
+ freez(cfg->to);
+ freez(cfg->units);
+ freez(cfg->info);
+ freez(cfg->classification);
+ freez(cfg->component);
+ freez(cfg->type);
+ freez(cfg->delay);
+ freez(cfg->options);
+ freez(cfg->repeat);
+ freez(cfg->host_labels);
+ freez(cfg->p_db_lookup_dimensions);
+ freez(cfg->p_db_lookup_method); // the other p_db_lookup_* members and p_update_every are assigned by value in health_readfile(), so they are not released here
+ freez(cfg); // finally release the structure itself; cfg must not be used afterwards
+}
+
static int health_readfile(const char *filename, void *data) {
RRDHOST *host = (RRDHOST *)data;
@@ -554,6 +611,7 @@ static int health_readfile(const char *filename, void *data) {
RRDCALC *rc = NULL;
RRDCALCTEMPLATE *rt = NULL;
+ struct alert_config *alert_cfg = NULL;
int ignore_this = 0;
size_t line = 0, append = 0;
@@ -603,16 +661,18 @@ static int health_readfile(const char *filename, void *data) {
if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
if(rc) {
- if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) {
+ if(ignore_this || !alert_hash_and_store_config(rc->config_hash_id, alert_cfg) || !rrdcalc_add_alarm_from_config(host, rc)) {
rrdcalc_free(rc);
+ alert_config_free(alert_cfg);
}
// health_add_alarms_loop(host, rc, ignore_this) ;
}
if(rt) {
- if (ignore_this || !rrdcalctemplate_add_template_from_config(host, rt))
+ if (ignore_this || !alert_hash_and_store_config(rt->config_hash_id, alert_cfg) || !rrdcalctemplate_add_template_from_config(host, rt)) {
rrdcalctemplate_free(rt);
-
+ alert_config_free(alert_cfg);
+ }
rt = NULL;
}
@@ -629,25 +689,30 @@ static int health_readfile(const char *filename, void *data) {
rc->old_status = RRDCALC_STATUS_UNINITIALIZED;
rc->warn_repeat_every = host->health_default_warn_repeat_every;
rc->crit_repeat_every = host->health_default_crit_repeat_every;
+ alert_cfg = callocz(1, sizeof(struct alert_config));
if(rrdvar_fix_name(rc->name))
error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
+ alert_cfg->alarm = strdupz(rc->name);
ignore_this = 0;
}
else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
if(rc) {
// health_add_alarms_loop(host, rc, ignore_this) ;
- if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) {
+ if(ignore_this || !alert_hash_and_store_config(rc->config_hash_id, alert_cfg) || !rrdcalc_add_alarm_from_config(host, rc)) {
rrdcalc_free(rc);
+ alert_config_free(alert_cfg);
}
rc = NULL;
}
if(rt) {
- if(ignore_this || !rrdcalctemplate_add_template_from_config(host, rt))
+ if(ignore_this || !alert_hash_and_store_config(rt->config_hash_id, alert_cfg) || !rrdcalctemplate_add_template_from_config(host, rt)) {
rrdcalctemplate_free(rt);
+ alert_config_free(alert_cfg);
+ }
}
rt = callocz(1, sizeof(RRDCALCTEMPLATE));
@@ -659,14 +724,17 @@ static int health_readfile(const char *filename, void *data) {
rt->delay_multiplier = 1.0;
rt->warn_repeat_every = host->health_default_warn_repeat_every;
rt->crit_repeat_every = host->health_default_crit_repeat_every;
+ alert_cfg = callocz(1, sizeof(struct alert_config));
if(rrdvar_fix_name(rt->name))
error("Health configuration renamed template '%s' to '%s'", value, rt->name);
+ alert_cfg->template_key = strdupz(rt->name);
ignore_this = 0;
}
else if(hash == hash_os && !strcasecmp(key, HEALTH_OS_KEY)) {
char *os_match = value;
+ if (alert_cfg) alert_cfg->os = strdupz(value);
SIMPLE_PATTERN *os_pattern = simple_pattern_create(os_match, NULL, SIMPLE_PATTERN_EXACT);
if(!simple_pattern_matches(os_pattern, host->os)) {
@@ -683,6 +751,7 @@ static int health_readfile(const char *filename, void *data) {
}
else if(hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) {
char *host_match = value;
+ if (alert_cfg) alert_cfg->host = strdupz(value);
SIMPLE_PATTERN *host_pattern = simple_pattern_create(host_match, NULL, SIMPLE_PATTERN_EXACT);
if(!simple_pattern_matches(host_pattern, host->hostname)) {
@@ -699,6 +768,7 @@ static int health_readfile(const char *filename, void *data) {
}
else if(rc) {
if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
+ alert_cfg->on = strdupz(value);
if(rc->chart) {
if(strcmp(rc->chart, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -710,6 +780,7 @@ static int health_readfile(const char *filename, void *data) {
rc->hash_chart = simple_hash(rc->chart);
}
else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) {
+ alert_cfg->classification = strdupz(value);
if(rc->classification) {
if(strcmp(rc->classification, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -721,6 +792,7 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rc->classification);
}
else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) {
+ alert_cfg->component = strdupz(value);
if(rc->component) {
if(strcmp(rc->component, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -732,6 +804,7 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rc->component);
}
else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) {
+ alert_cfg->type = strdupz(value);
if(rc->type) {
if(strcmp(rc->type, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -743,18 +816,32 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rc->type);
}
else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
+ alert_cfg->lookup = strdupz(value);
health_parse_db_lookup(line, filename, value, &rc->group, &rc->after, &rc->before,
&rc->update_every, &rc->options, &rc->dimensions, &rc->foreachdim);
if(rc->foreachdim) {
rc->spdim = health_pattern_from_foreach(rc->foreachdim);
}
+ if (rc->after) {
+ if (rc->dimensions)
+ alert_cfg->p_db_lookup_dimensions = strdupz(rc->dimensions);
+ if (rc->group)
+ alert_cfg->p_db_lookup_method = strdupz(group_method2string(rc->group));
+ alert_cfg->p_db_lookup_options = rc->options;
+ alert_cfg->p_db_lookup_after = rc->after;
+ alert_cfg->p_db_lookup_before = rc->before;
+ alert_cfg->p_update_every = rc->update_every;
+ }
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
+ alert_cfg->every = strdupz(value);
if(!config_parse_duration(value, &rc->update_every))
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
line, filename, rc->name, key, value);
+ alert_cfg->p_update_every = rc->update_every;
}
else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
+ alert_cfg->green = strdupz(value);
char *e;
rc->green = str2ld(value, &e);
if(e && *e) {
@@ -763,6 +850,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
+ alert_cfg->red = strdupz(value);
char *e;
rc->red = str2ld(value, &e);
if(e && *e) {
@@ -771,6 +859,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
+ alert_cfg->calc = strdupz(value);
const char *failed_at = NULL;
int error = 0;
rc->calculation = expression_parse(value, &failed_at, &error);
@@ -780,6 +869,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
+ alert_cfg->warn = strdupz(value);
const char *failed_at = NULL;
int error = 0;
rc->warning = expression_parse(value, &failed_at, &error);
@@ -789,6 +879,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
+ alert_cfg->crit = strdupz(value);
const char *failed_at = NULL;
int error = 0;
rc->critical = expression_parse(value, &failed_at, &error);
@@ -798,6 +889,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
+ alert_cfg->exec = strdupz(value);
if(rc->exec) {
if(strcmp(rc->exec, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -808,6 +900,7 @@ static int health_readfile(const char *filename, void *data) {
rc->exec = strdupz(value);
}
else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
+ alert_cfg->to = strdupz(value);
if(rc->recipient) {
if(strcmp(rc->recipient, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -818,6 +911,7 @@ static int health_readfile(const char *filename, void *data) {
rc->recipient = strdupz(value);
}
else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
+ alert_cfg->units = strdupz(value);
if(rc->units) {
if(strcmp(rc->units, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -829,6 +923,7 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rc->units);
}
else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
+ alert_cfg->info = strdupz(value);
if(rc->info) {
if(strcmp(rc->info, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -840,17 +935,21 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rc->info);
}
else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
+ alert_cfg->delay = strdupz(value);
health_parse_delay(line, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
}
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
+ alert_cfg->options = strdupz(value);
rc->options |= health_parse_options(value);
}
else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
+ alert_cfg->repeat = strdupz(value);
health_parse_repeat(line, filename, value,
&rc->warn_repeat_every,
&rc->crit_repeat_every);
}
else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) {
+ alert_cfg->host_labels = strdupz(value);
if(rc->labels) {
if(strcmp(rc->labels, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.",
@@ -864,6 +963,7 @@ static int health_readfile(const char *filename, void *data) {
rc->splabels = simple_pattern_create(rc->labels, NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) {
+ alert_cfg->plugin = strdupz(value);
freez(rc->plugin_match);
simple_pattern_free(rc->plugin_pattern);
@@ -871,6 +971,7 @@ static int health_readfile(const char *filename, void *data) {
rc->plugin_pattern = simple_pattern_create(rc->plugin_match, NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) {
+ alert_cfg->module = strdupz(value);
freez(rc->module_match);
simple_pattern_free(rc->module_pattern);
@@ -884,6 +985,7 @@ static int health_readfile(const char *filename, void *data) {
}
else if(rt) {
if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
+ alert_cfg->on = strdupz(value);
if(rt->context) {
if(strcmp(rt->context, value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -895,6 +997,7 @@ static int health_readfile(const char *filename, void *data) {
rt->hash_context = simple_hash(rt->context);
}
else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) {
+ alert_cfg->classification = strdupz(value);
if(rt->classification) {
if(strcmp(rt->classification, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -906,6 +1009,7 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rt->classification);
}
else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) {
+ alert_cfg->component = strdupz(value);
if(rt->component) {
if(strcmp(rt->component, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -917,6 +1021,7 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rt->component);
}
else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) {
+ alert_cfg->type = strdupz(value);
if(rt->type) {
if(strcmp(rt->type, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -928,6 +1033,7 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rt->type);
}
else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) {
+ alert_cfg->families = strdupz(value);
freez(rt->family_match);
simple_pattern_free(rt->family_pattern);
@@ -935,6 +1041,7 @@ static int health_readfile(const char *filename, void *data) {
rt->family_pattern = simple_pattern_create(rt->family_match, NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) {
+ alert_cfg->plugin = strdupz(value);
freez(rt->plugin_match);
simple_pattern_free(rt->plugin_pattern);
@@ -942,6 +1049,7 @@ static int health_readfile(const char *filename, void *data) {
rt->plugin_pattern = simple_pattern_create(rt->plugin_match, NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) {
+ alert_cfg->module = strdupz(value);
freez(rt->module_match);
simple_pattern_free(rt->module_pattern);
@@ -949,6 +1057,7 @@ static int health_readfile(const char *filename, void *data) {
rt->module_pattern = simple_pattern_create(rt->module_match, NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_charts && !strcasecmp(key, HEALTH_CHARTS_KEY)) {
+ alert_cfg->charts = strdupz(value);
freez(rt->charts_match);
simple_pattern_free(rt->charts_pattern);
@@ -956,18 +1065,32 @@ static int health_readfile(const char *filename, void *data) {
rt->charts_pattern = simple_pattern_create(rt->charts_match, NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
+ alert_cfg->lookup = strdupz(value);
health_parse_db_lookup(line, filename, value, &rt->group, &rt->after, &rt->before,
&rt->update_every, &rt->options, &rt->dimensions, &rt->foreachdim);
if(rt->foreachdim) {
rt->spdim = health_pattern_from_foreach(rt->foreachdim);
}
+ if (rt->after) {
+ if (rt->dimensions)
+ alert_cfg->p_db_lookup_dimensions = strdupz(rt->dimensions);
+ if (rt->group)
+ alert_cfg->p_db_lookup_method = strdupz(group_method2string(rt->group));
+ alert_cfg->p_db_lookup_options = rt->options;
+ alert_cfg->p_db_lookup_after = rt->after;
+ alert_cfg->p_db_lookup_before = rt->before;
+ alert_cfg->p_update_every = rt->update_every;
+ }
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
+ alert_cfg->every = strdupz(value);
if(!config_parse_duration(value, &rt->update_every))
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
line, filename, rt->name, key, value);
+ alert_cfg->p_update_every = rt->update_every;
}
else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
+ alert_cfg->green = strdupz(value);
char *e;
rt->green = str2ld(value, &e);
if(e && *e) {
@@ -976,6 +1099,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
+ alert_cfg->red = strdupz(value);
char *e;
rt->red = str2ld(value, &e);
if(e && *e) {
@@ -984,6 +1108,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
+ alert_cfg->calc = strdupz(value);
const char *failed_at = NULL;
int error = 0;
rt->calculation = expression_parse(value, &failed_at, &error);
@@ -993,6 +1118,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
+ alert_cfg->warn = strdupz(value);
const char *failed_at = NULL;
int error = 0;
rt->warning = expression_parse(value, &failed_at, &error);
@@ -1002,6 +1128,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
+ alert_cfg->crit = strdupz(value);
const char *failed_at = NULL;
int error = 0;
rt->critical = expression_parse(value, &failed_at, &error);
@@ -1011,6 +1138,7 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
+ alert_cfg->exec = strdupz(value);
if(rt->exec) {
if(strcmp(rt->exec, value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -1021,6 +1149,7 @@ static int health_readfile(const char *filename, void *data) {
rt->exec = strdupz(value);
}
else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
+ alert_cfg->to = strdupz(value);
if(rt->recipient) {
if(strcmp(rt->recipient, value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -1031,6 +1160,7 @@ static int health_readfile(const char *filename, void *data) {
rt->recipient = strdupz(value);
}
else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
+ alert_cfg->units = strdupz(value);
if(rt->units) {
if(strcmp(rt->units, value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -1042,6 +1172,7 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rt->units);
}
else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
+ alert_cfg->info = strdupz(value);
if(rt->info) {
if(strcmp(rt->info, value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -1053,17 +1184,21 @@ static int health_readfile(const char *filename, void *data) {
strip_quotes(rt->info);
}
else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
+ alert_cfg->delay = strdupz(value);
health_parse_delay(line, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
}
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
+ alert_cfg->options = strdupz(value);
rt->options |= health_parse_options(value);
}
else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
+ alert_cfg->repeat = strdupz(value);
health_parse_repeat(line, filename, value,
&rt->warn_repeat_every,
&rt->crit_repeat_every);
}
else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) {
+ alert_cfg->host_labels = strdupz(value);
if(rt->labels) {
if(strcmp(rt->labels, value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
@@ -1089,16 +1224,20 @@ static int health_readfile(const char *filename, void *data) {
if(rc) {
//health_add_alarms_loop(host, rc, ignore_this) ;
- if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) {
+ if(ignore_this || !alert_hash_and_store_config(rc->config_hash_id, alert_cfg) || !rrdcalc_add_alarm_from_config(host, rc)) {
rrdcalc_free(rc);
}
}
if(rt) {
- if(ignore_this || !rrdcalctemplate_add_template_from_config(host, rt))
+ if(ignore_this || !alert_hash_and_store_config(rt->config_hash_id, alert_cfg) || !rrdcalctemplate_add_template_from_config(host, rt)) {
rrdcalctemplate_free(rt);
+ }
}
+ if (alert_cfg)
+ alert_config_free(alert_cfg);
+
fclose(fp);
return 1;
}
diff --git a/health/health_json.c b/health/health_json.c
index 4df44611c..a21d5a4fd 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -14,12 +14,19 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const
}
void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
+ char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0");
+ char config_hash_id[GUID_LEN + 1];
+ uuid_unparse_lower(ae->config_hash_id, config_hash_id);
+
buffer_sprintf(wb,
"\n\t{\n"
"\t\t\"hostname\": \"%s\",\n"
+ "\t\t\"utc_offset\": %d,\n"
+ "\t\t\"timezone\": \"%s\",\n"
"\t\t\"unique_id\": %u,\n"
"\t\t\"alarm_id\": %u,\n"
"\t\t\"alarm_event_id\": %u,\n"
+ "\t\t\"config_hash_id\": \"%s\",\n"
"\t\t\"name\": \"%s\",\n"
"\t\t\"chart\": \"%s\",\n"
"\t\t\"family\": \"%s\",\n"
@@ -34,6 +41,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
"\t\t\"recipient\": \"%s\",\n"
"\t\t\"exec_code\": %d,\n"
"\t\t\"source\": \"%s\",\n"
+ "\t\t\"command\": \"%s\",\n"
"\t\t\"units\": \"%s\",\n"
"\t\t\"when\": %lu,\n"
"\t\t\"duration\": %lu,\n"
@@ -49,9 +57,12 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
"\t\t\"last_repeat\": \"%lu\",\n"
"\t\t\"silenced\": \"%s\",\n"
, host->hostname
+ , host->utc_offset
+ , host->abbrev_timezone
, ae->unique_id
, ae->alarm_id
, ae->alarm_event_id
+ , config_hash_id
, ae->name
, ae->chart
, ae->family
@@ -66,6 +77,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
, ae->recipient?ae->recipient:host->health_default_recipient
, ae->exec_code
, ae->source
+ , edit_command
, ae->units?ae->units:""
, (unsigned long)ae->when
, (unsigned long)ae->duration
@@ -114,6 +126,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
buffer_strcat(wb, "\t}");
freez(replaced_info);
+ freez(edit_command);
}
void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
@@ -178,9 +191,13 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
}
}
+ char hash_id[GUID_LEN + 1];
+ uuid_unparse_lower(rc->config_hash_id, hash_id);
+
buffer_sprintf(wb,
"\t\t\"%s.%s\": {\n"
"\t\t\t\"id\": %lu,\n"
+ "\t\t\t\"config_hash_id\": \"%s\",\n"
"\t\t\t\"name\": \"%s\",\n"
"\t\t\t\"chart\": \"%s\",\n"
"\t\t\t\"family\": \"%s\",\n"
@@ -212,6 +229,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
"\t\t\t\"last_repeat\": \"%lu\",\n"
, rc->chart, rc->name
, (unsigned long)rc->id
+ , hash_id
, rc->name
, rc->chart
, (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
diff --git a/health/health_log.c b/health/health_log.c
index de0a0883b..d20085d9e 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -38,39 +38,41 @@ static inline void health_log_rotate(RRDHOST *host) {
}
if(unlikely(host->health_log_entries_written > rotate_every)) {
- health_alarm_log_close(host);
+ if(unlikely(host->health_log_fp)) {
+ health_alarm_log_close(host);
- char old_filename[FILENAME_MAX + 1];
- snprintfz(old_filename, FILENAME_MAX, "%s.old", host->health_log_filename);
+ char old_filename[FILENAME_MAX + 1];
+ snprintfz(old_filename, FILENAME_MAX, "%s.old", host->health_log_filename);
- if(unlink(old_filename) == -1 && errno != ENOENT)
- error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, old_filename);
+ if(unlink(old_filename) == -1 && errno != ENOENT)
+ error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, old_filename);
- if(link(host->health_log_filename, old_filename) == -1 && errno != ENOENT)
- error("HEALTH [%s]: cannot move file '%s' to '%s'.", host->hostname, host->health_log_filename, old_filename);
+ if(link(host->health_log_filename, old_filename) == -1 && errno != ENOENT)
+ error("HEALTH [%s]: cannot move file '%s' to '%s'.", host->hostname, host->health_log_filename, old_filename);
- if(unlink(host->health_log_filename) == -1 && errno != ENOENT)
- error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, host->health_log_filename);
+ if(unlink(host->health_log_filename) == -1 && errno != ENOENT)
+ error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, host->health_log_filename);
- // open it with truncate
- host->health_log_fp = fopen(host->health_log_filename, "w");
+ // open it with truncate
+ host->health_log_fp = fopen(host->health_log_filename, "w");
- if(host->health_log_fp)
- fclose(host->health_log_fp);
- else
- error("HEALTH [%s]: cannot truncate health log '%s'", host->hostname, host->health_log_filename);
+ if(host->health_log_fp)
+ fclose(host->health_log_fp);
+ else
+ error("HEALTH [%s]: cannot truncate health log '%s'", host->hostname, host->health_log_filename);
- host->health_log_fp = NULL;
+ host->health_log_fp = NULL;
- host->health_log_entries_written = 0;
- health_alarm_log_open(host);
+ host->health_log_entries_written = 0;
+ health_alarm_log_open(host);
+ }
}
}
inline void health_label_log_save(RRDHOST *host) {
health_log_rotate(host);
- if(likely(host->health_log_fp)) {
+ if(unlikely(host->health_log_fp)) {
BUFFER *wb = buffer_create(1024);
rrdhost_check_rdlock(host);
netdata_rwlock_rdlock(&host->labels.labels_rwlock);
@@ -101,7 +103,7 @@ inline void health_label_log_save(RRDHOST *host) {
inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
health_log_rotate(host);
- if(likely(host->health_log_fp)) {
+ if(unlikely(host->health_log_fp)) {
if(unlikely(fprintf(host->health_log_fp
, "%c\t%s"
"\t%08x\t%08x\t%08x\t%08x\t%08x"
@@ -155,13 +157,12 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
host->health_log_entries_written++;
}
- }
+ }else
+ sql_health_alarm_log_save(host, ae);
+
#ifdef ENABLE_ACLK
if (netdata_cloud_setting) {
- if ((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) ||
- ((ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL))) {
- aclk_update_alarm(host, ae);
- }
+ sql_queue_alarm_to_aclk(host, ae);
}
#endif
}
@@ -368,7 +369,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
ae->last_repeat = last_repeat;
- if (likely(entries > 28)) {
+ if (likely(entries > 30)) {
freez(ae->classification);
ae->classification = strdupz(pointers[28]);
if(!*ae->classification) { freez(ae->classification); ae->classification = NULL; }
@@ -392,9 +393,13 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
if(unlikely(*pointers[0] == 'A')) {
ae->next = host->health_log.alarms;
host->health_log.alarms = ae;
+ sql_health_alarm_log_insert(host, ae);
loaded++;
}
- else updated++;
+ else {
+ sql_health_alarm_log_update(host, ae);
+ updated++;
+ }
if(unlikely(ae->unique_id > host->health_max_unique_id))
host->health_max_unique_id = ae->unique_id;
@@ -444,8 +449,6 @@ inline void health_alarm_log_load(RRDHOST *host) {
health_alarm_log_read(host, fp, host->health_log_filename);
fclose(fp);
}
-
- health_alarm_log_open(host);
}
@@ -456,6 +459,7 @@ inline ALARM_ENTRY* health_create_alarm_entry(
RRDHOST *host,
uint32_t alarm_id,
uint32_t alarm_event_id,
+ uuid_t config_hash_id,
time_t when,
const char *name,
const char *chart,
@@ -487,6 +491,8 @@ inline ALARM_ENTRY* health_create_alarm_entry(
ae->hash_chart = simple_hash(ae->chart);
}
+ uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id));
+
if(family)
ae->family = strdupz(family);
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 9a3a80ad6..08a32ff10 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -239,6 +239,11 @@ else
calc_param_values="${22}" # the values of the parameters in the expression, at the time of the evaluation
total_warnings="${23}" # Total number of alarms in WARNING state
total_critical="${24}" # Total number of alarms in CRITICAL state
+ total_warn_alarms="${25}" # List of alarms in warning state
+ total_crit_alarms="${26}" # List of alarms in critical state
+ classification="${27}" # The class field from .conf files
+ edit_command_line="${28}" # The command to edit the alarm, with the line number
+ sender_host="${29}" # The host sending this notification
fi
# -----------------------------------------------------------------------------
@@ -253,6 +258,17 @@ else
fi
# -----------------------------------------------------------------------------
+# Do the same for sender_host: fall back to the local short hostname when netdata did not supply one.
+# The variable is quoted so a value containing whitespace or glob characters cannot break the test.
+if [ -z "${sender_host}" ]; then
+ this_host=$(hostname -s 2>/dev/null)
+ s_host="${this_host}"
+ sender_host="${this_host}"
+else
+ s_host="${sender_host}"
+fi
+
+# -----------------------------------------------------------------------------
# screen statuses we don't need to send a notification
# don't do anything if this is not WARNING, CRITICAL or CLEAR
@@ -303,7 +319,7 @@ SLACK_WEBHOOK_URL=
# Microsoft Teams configs
MSTEAMS_WEBHOOK_URL=
-# Legacy Microsoft Teams configs for backwards compatability:
+# Legacy Microsoft Teams configs for backwards compatibility:
declare -A role_recipients_msteam
# rocketchat configs
@@ -810,6 +826,14 @@ date=$(date --date=@${when} "${date_format}" 2>/dev/null)
[ -z "${date}" ] && date=$(date --date=@${when} 2>/dev/null)
[ -z "${date}" ] && date=$(date 2>/dev/null)
+# -----------------------------------------------------------------------------
+# get the date in utc the alarm happened
+
+date_utc=$(date --date=@${when} "${date_format}" -u 2>/dev/null)
+[ -z "${date_utc}" ] && date_utc=$(date -u "${date_format}" 2>/dev/null)
+[ -z "${date_utc}" ] && date_utc=$(date -u --date=@${when} 2>/dev/null)
+[ -z "${date_utc}" ] && date_utc=$(date -u 2>/dev/null)
+
# ----------------------------------------------------------------------------
# prepare some extra headers if we've been asked to thread e-mails
if [ "${SEND_EMAIL}" == "YES" ] && [ "${EMAIL_THREADING}" != "NO" ]; then
@@ -915,7 +939,7 @@ send_email() {
fi
[ -n "${sender_email}" ] && opts+=(-f "${sender_email}")
- [ -n "${sender_name}" ] && sendmail --help 2>&1 | grep -q "\-F " && opts+=(-F "${sender_name}")
+ [ -n "${sender_name}" ] && ${sendmail} -F 2>&1 | head -1 | grep -qv "sendmail: unrecognized option: F" && opts+=(-F "${sender_name}")
if [ "${debug}" = "1" ]; then
echo >&2 "--- BEGIN sendmail command ---"
@@ -1364,15 +1388,15 @@ EOF
)"
# Replacing in the webhook CHANNEL string by the MS Teams channel name from conf file.
- webhook="${webhook//CHANNEL/${channel}}"
+ cur_webhook="${webhook//CHANNEL/${channel}}"
- httpcode=$(docurl -H "Content-Type: application/json" -d "${payload}" "${webhook}")
+ httpcode=$(docurl -H "Content-Type: application/json" -d "${payload}" "${cur_webhook}")
if [ "${httpcode}" = "200" ]; then
- info "sent Microsoft team notification for: ${host} ${chart}.${name} is ${status} to '${webhook}'"
+ info "sent Microsoft team notification for: ${host} ${chart}.${name} is ${status} to '${cur_webhook}'"
sent=$((sent + 1))
else
- error "failed to send Microsoft team notification for: ${host} ${chart}.${name} is ${status} to '${webhook}', with HTTP response status code ${httpcode}."
+ error "failed to send Microsoft team notification for: ${host} ${chart}.${name} is ${status} to '${cur_webhook}', with HTTP response status code ${httpcode}."
fi
done
@@ -2113,12 +2137,12 @@ send_dynatrace() {
[ "${SEND_DYNATRACE}" != "YES" ] && return 1
local dynatrace_url="${DYNATRACE_SERVER}/e/${DYNATRACE_SPACE}/api/v1/events"
- local description="NetData Notification for: ${host} ${chart}.${name} is ${status}"
+ local description="Netdata Notification for: ${host} ${chart}.${name} is ${status}"
local payload=""
payload=$(cat <<EOF
{
- "title": "NetData Alarm from ${host}",
+ "title": "Netdata Alarm from ${host}",
"source" : "${DYNATRACE_ANNOTATION_TYPE}",
"description" : "${description}",
"eventType": "${DYNATRACE_EVENT}",
@@ -2266,8 +2290,10 @@ urlencode "${family}" >/dev/null
url_family="${REPLY}"
urlencode "${name}" >/dev/null
url_name="${REPLY}"
+urlencode "${value_string}" >/dev/null
+url_value_string="${REPLY}"
-redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}"
+redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}&alarm_status=${status}&alarm_chart=${chart}&alarm_value=${url_value_string}"
GOTOCLOUD=0
if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then
@@ -2284,9 +2310,9 @@ fi
if [ ${GOTOCLOUD} -eq 0 ]; then
goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}"
else
- # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud
- #goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
- goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}"
+ # Alarm redirection was temporarily disabled when the cloud endpoint went away (see the discussion on #9487).
+ # It is re-enabled here for alarms 2.0, with the new template.
+ goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
fi
# the severity of the alarm
@@ -2311,48 +2337,79 @@ alarm="${name//_/ } = ${value_string}"
# the image of the alarm
image="${images_base_url}/images/banner-icon-144x144.png"
+# have a default email status, in case the following case does not catch it
+status_email_subject="${status}"
+
# prepare the title based on status
case "${status}" in
CRITICAL)
image="${images_base_url}/images/alert-128-red.png"
+ alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_critical.png"
status_message="is critical"
+ status_email_subject="Critical"
color="#ca414b"
+ rich_status_raised_for="Raised to critical, for ${non_clear_duration_txt}"
+ background_color="#FFEBEF"
+ border_color="#FF4136"
+ text_color="#FF4136"
+ action_text_color="#FFFFFF"
;;
WARNING)
image="${images_base_url}/images/alert-128-orange.png"
+ alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_warning.png"
status_message="needs attention"
+ status_email_subject="Warning"
color="#ffc107"
+ rich_status_raised_for="Raised to warning, for ${non_clear_duration_txt}"
+ background_color="#FFF8E1"
+ border_color="#FFC300"
+ text_color="#536775"
+ action_text_color="#35414A"
;;
CLEAR)
image="${images_base_url}/images/check-mark-2-128-green.png"
+ alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_recovered.png"
status_message="recovered"
+ status_email_subject="Clear"
color="#77ca6d"
+ rich_status_raised_for=
+ background_color="#E5F5E8"
+ border_color="#68C47D"
+ text_color="#00AB44"
+ action_text_color="#FFFFFF"
;;
esac
+# the html email subject
+html_email_subject="${status_email_subject}, ${name} = ${value_string}, on ${host}"
+
if [ "${status}" = "CLEAR" ]; then
severity="Recovered from ${old_status}"
if [ ${non_clear_duration} -gt ${duration} ]; then
raised_for="(alarm was raised for ${non_clear_duration_txt})"
fi
+ rich_status_raised_for="Recovered from ${old_status,,}${raised_for:+, ${raised_for}}" # raised_for is only set when non_clear_duration > duration
# don't show the value when the status is CLEAR
# for certain alarms, this value might not have any meaning
alarm="${name//_/ } ${raised_for}"
+ html_email_subject="${status_email_subject}, ${name}${raised_for:+ ${raised_for}}, on ${host}" # avoid " , on host" when raised_for is empty
elif { [ "${old_status}" = "WARNING" ] && [ "${status}" = "CRITICAL" ]; }; then
severity="Escalated to ${status}"
if [ ${non_clear_duration} -gt ${duration} ]; then
raised_for="(alarm is raised for ${non_clear_duration_txt})"
fi
+ rich_status_raised_for="Escalated to critical${raised_for:+, ${raised_for}}" # raised_for is only set when non_clear_duration > duration
elif { [ "${old_status}" = "CRITICAL" ] && [ "${status}" = "WARNING" ]; }; then
severity="Demoted to ${status}"
if [ ${non_clear_duration} -gt ${duration} ]; then
raised_for="(alarm is raised for ${non_clear_duration_txt})"
fi
+ rich_status_raised_for="Demoted to warning${raised_for:+, ${raised_for}}" # raised_for is only set when non_clear_duration > duration
else
raised_for=
@@ -2628,6 +2685,13 @@ Subject: ${host} ${status_message} - ${name//_/ } - ${chart}
MIME-Version: 1.0
Content-Type: multipart/alternative; boundary="multipart-boundary"
${email_thread_headers}
+X-Netdata-Severity: ${status,,}
+X-Netdata-Alert-Name: $name
+X-Netdata-Chart: $chart
+X-Netdata-Family: $family
+X-Netdata-Classification: $classification
+X-Netdata-Host: $host
+X-Netdata-Role: $roles
This is a MIME-encoded multipart message
@@ -2638,120 +2702,742 @@ EOF
else
+now=$(date "+%s")
+
+if [ -n "$total_warn_alarms" ]; then
+ while read -d, -r pair; do
+ IFS='=' read -r key val <<<"$pair"
+
+ date_w=$(date --date=@${val} "${date_format}" 2>/dev/null)
+ [ -z "${date_w}" ] && date_w=$(date "${date_format}" 2>/dev/null)
+ [ -z "${date_w}" ] && date_w=$(date --date=@${val} 2>/dev/null)
+ [ -z "${date_w}" ] && date_w=$(date 2>/dev/null)
+
+ elapsed=$((now - val))
+
+ duration4human ${elapsed} >/dev/null
+ elapsed_txt="${REPLY}"
+
+ WARN_ALARMS+="
+ <div class=\"set-font\" style=\"font-family: 'IBM Plex Sans', sans-serif; background: #FFFFFF; background-color: #FFFFFF; margin: 0px auto; max-width: 600px;\">
+ <table align=\"center\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\" role=\"presentation\" style=\"background:#FFFFFF;background-color:#FFFFFF;width:100%;\">
+ <tbody>
+ <tr>
+ <td style=\"border-top:8px solid #F7F8F8;direction:ltr;font-size:0px;padding:20px 0;text-align:center;\">
+ <!--[if mso | IE]><table role=\"presentation\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\"><tr><td class=\"\" style=\"vertical-align:top;width:300px;\" ><![endif]-->
+ <div class=\"mj-column-per-50 mj-outlook-group-fix\" style=\"font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:50%;\">
+ <table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" role=\"presentation\" style=\"vertical-align:top;\" width=\"100%\">
+ <tbody>
+ <tr>
+ <td align=\"left\" style=\"font-size:0px;padding:10px 25px;word-break:break-word;\">
+ <div style=\"font-family:Open Sans, sans-serif;font-size:14px;font-weight:600;line-height:1;text-align:left;color:#35414A;\">${key}</div>
+ </td>
+ </tr>
+ <tr>
+ <td align=\"left\" style=\"font-size:0px;padding:10px 25px;padding-top:2px;word-break:break-word;\">
+ <div style=\"font-family:Open Sans, sans-serif;font-size:12px;line-height:1;text-align:left;color:#35414A;\">${date_w}</div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td><td class=\"\" style=\"vertical-align:top;width:300px;\" ><![endif]-->
+ <div class=\"mj-column-per-50 mj-outlook-group-fix\" style=\"font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:50%;\">
+ <table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" role=\"presentation\" width=\"100%\">
+ <tbody>
+ <tr>
+ <td style=\"vertical-align:top;padding-top:13px;\">
+ <table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" role=\"presentation\" style width=\"100%\">
+ <tbody>
+ <tr>
+ <td align=\"right\" style=\"font-size:0px;padding:10px 25px;word-break:break-word;\">
+ <div style=\"font-family:Open Sans, sans-serif;font-size:13px;line-height:1;text-align:right;color:#555555;\"><span style=\"background-color:#FFF8E1; border: 1px solid #FFC300; border-radius:36px; padding: 2px 12px; margin-top: 20px; white-space: nowrap\">
+ Warning for ${elapsed_txt}
+ </span></div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ "
+
+ done <<<"$total_warn_alarms,"
+fi
+
+if [ -n "$total_crit_alarms" ]; then
+ while read -d, -r pair; do
+ IFS='=' read -r key val <<<"$pair"
+
+ date_c=$(date --date=@${val} "${date_format}" 2>/dev/null)
+ [ -z "${date_c}" ] && date_c=$(date "${date_format}" 2>/dev/null)
+ [ -z "${date_c}" ] && date_c=$(date --date=@${val} 2>/dev/null)
+ [ -z "${date_c}" ] && date_c=$(date 2>/dev/null)
+
+ elapsed=$((now - val))
+
+ duration4human ${elapsed} >/dev/null
+ elapsed_txt="${REPLY}"
+
+ CRIT_ALARMS+="
+ <div class=\"set-font\" style=\"font-family: 'IBM Plex Sans', sans-serif; background: #FFFFFF; background-color: #FFFFFF; margin: 0px auto; max-width: 600px;\">
+ <table align=\"center\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\" role=\"presentation\" style=\"background:#FFFFFF;background-color:#FFFFFF;width:100%;\">
+ <tbody>
+ <tr>
+ <td style=\"border-top:8px solid #F7F8F8;direction:ltr;font-size:0px;padding:20px 0;text-align:center;\">
+ <!--[if mso | IE]><table role=\"presentation\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\"><tr><td class=\"\" style=\"vertical-align:top;width:300px;\" ><![endif]-->
+ <div class=\"mj-column-per-50 mj-outlook-group-fix\" style=\"font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:50%;\">
+ <table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" role=\"presentation\" style=\"vertical-align:top;\" width=\"100%\">
+ <tbody>
+ <tr>
+ <td align=\"left\" style=\"font-size:0px;padding:10px 25px;word-break:break-word;\">
+ <div style=\"font-family:Open Sans, sans-serif;font-size:14px;font-weight:600;line-height:1;text-align:left;color:#35414A;\">${key}</div>
+ </td>
+ </tr>
+ <tr>
+ <td align=\"left\" style=\"font-size:0px;padding:10px 25px;padding-top:2px;word-break:break-word;\">
+ <div style=\"font-family:Open Sans, sans-serif;font-size:12px;line-height:1;text-align:left;color:#35414A;\">${date_c}</div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td><td class=\"\" style=\"vertical-align:top;width:300px;\" ><![endif]-->
+ <div class=\"mj-column-per-50 mj-outlook-group-fix\" style=\"font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:50%;\">
+ <table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" role=\"presentation\" width=\"100%\">
+ <tbody>
+ <tr>
+ <td style=\"vertical-align:top;padding-top:13px;\">
+ <table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" role=\"presentation\" style width=\"100%\">
+ <tbody>
+ <tr>
+ <td align=\"right\" style=\"font-size:0px;padding:10px 25px;word-break:break-word;\">
+ <div style=\"font-family:Open Sans, sans-serif;font-size:13px;line-height:1;text-align:right;color:#35414A;\"><span style=\"background-color:#FFEBEF; border: 1px solid #FF4136; border-radius:36px; padding: 2px 12px; margin-top: 20px; white-space: nowrap\">
+ Critical for ${elapsed_txt}
+ </span></div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ "
+
+ done <<<"$total_crit_alarms,"
+fi
+
+if [ -n "$edit_command_line" ]; then
+ IFS='=' read -r edit_command line <<<"$edit_command_line"
+fi
+
IFS='' read -r -d '' email_html_part <<EOF
Content-Type: text/html; encoding=${EMAIL_CHARSET}
Content-Disposition: inline
Content-Transfer-Encoding: 8bit
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0; padding: 0;">
-<body style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 14px; width: 100% !important; min-height: 100%; line-height: 1.6; background: #f6f6f6; margin:0; padding: 0;">
-<table>
- <tbody>
- <tr>
- <td style="vertical-align: top;" valign="top"></td>
- <td width="700" style="vertical-align: top; display: block !important; max-width: 700px !important; clear: both !important; margin: 0 auto; padding: 0;" valign="top">
- <div style="max-width: 700px; display: block; margin: 0 auto; padding: 20px;">
- <table width="100%" cellpadding="0" cellspacing="0" style="background: #fff; border: 1px solid #e9e9e9;">
+<!doctype html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office">
+<head>
+ <title>
+ </title>
+ <!--[if !mso]><!-->
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
+ <!--<![endif]-->
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1">
+ <style type="text/css">
+ #outlook a { padding:0; }
+ body { margin:0;padding:0;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%; }
+ table, td { border-collapse:collapse;mso-table-lspace:0pt;mso-table-rspace:0pt; }
+ img { border:0;height:auto;line-height:100%; outline:none;text-decoration:none;-ms-interpolation-mode:bicubic; }
+ p { display:block;margin:13px 0; }
+ </style>
+ <!--[if mso]>
+ <xml>
+ <o:OfficeDocumentSettings>
+ <o:AllowPNG/>
+ <o:PixelsPerInch>96</o:PixelsPerInch>
+ </o:OfficeDocumentSettings>
+ </xml>
+ <![endif]-->
+ <!--[if lte mso 11]>
+ <style type="text/css">
+ .mj-outlook-group-fix { width:100% !important; }
+ </style>
+ <![endif]-->
+ <!--[if !mso]><!-->
+ <link href="https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;500;600;700&display=swap" rel="stylesheet" type="text/css">
+ <link href="https://fonts.googleapis.com/css?family=Ubuntu:300,400,500,700" rel="stylesheet" type="text/css">
+ <style type="text/css">
+ @import url(https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;500;600;700&display=swap);
+ @import url(https://fonts.googleapis.com/css?family=Ubuntu:300,400,500,700);
+ </style>
+ <!--<![endif]-->
+ <style type="text/css">
+ @media only screen and (min-width:100px) {
+ .mj-column-px-130 { width:130px !important; max-width: 130px; }
+ .mj-column-per-50 { width:50% !important; max-width: 50%; }
+ .mj-column-per-70 { width:70% !important; max-width: 70%; }
+ .mj-column-per-30 { width:30% !important; max-width: 30%; }
+ .mj-column-per-100 { width:100% !important; max-width: 100%; }
+ .mj-column-px-66 { width:66px !important; max-width: 66px; }
+ .mj-column-px-400 { width:400px !important; max-width: 400px; }
+ }
+ </style>
+ <style type="text/css">
+ @media only screen and (max-width:100px) {
+ table.mj-full-width-mobile { width: 100% !important; }
+ td.mj-full-width-mobile { width: auto !important; }
+ }
+ </style>
+</head>
+<body style="word-spacing:normal;">
+<div class="svgbg" style="background-image: url('https://app.netdata.cloud/static/email/img/isotype_600.png'); background-repeat: no-repeat; background-position: top center; background-size: 600px 192px;">
+ <!--[if mso | IE]><table align="center" border="0" cellpadding="0" cellspacing="0" class="" style="width:600px;" width="600" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div style="margin:0px auto;max-width:600px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;padding-bottom:50px;padding-left:0;text-align:left;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:130px;" ><![endif]-->
+ <div class="mj-column-px-130 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:130px;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="center" style="font-size:0px;padding:10px 25px;padding-right:0;padding-left:0;word-break:break-word;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="border-collapse:collapse;border-spacing:0px;">
<tbody>
<tr>
- <td bgcolor="#eee" style="padding: 5px 20px 5px 20px; background-color: #eee;">
- <div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 20px; color: #777; font-weight: bold;">netdata notification</div>
- </td>
+ <td style="width:130px;">
+ <img alt="Netdata Logo" height="auto" src="https://app.netdata.cloud/static/email/img/full_logo.png" style="border:0;display:block;outline:none;text-decoration:none;height:auto;width:100%;font-size:13px;" width="130">
+ </td>
</tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td><td class="" style="vertical-align:top;width:300px;" ><![endif]-->
+ <div class="mj-column-per-50 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:50%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" width="100%">
+ <tbody>
+ <tr>
+ <td style="vertical-align:top;padding-top:4px;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style width="100%">
+ <tbody>
<tr>
- <td bgcolor="${color}" style="font-size: 16px; vertical-align: top; font-weight: 400; text-align: center; margin: 0; padding: 10px; color: #ffffff; background: ${color} !important; border: 1px solid ${color}; border-top-color: ${color};" align="center" valign="top">
- <h1 style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: 400; margin: 0;">${host} ${status_message}</h1>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-left:10px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;line-height:1;text-align:left;color:#35414A;">Notification</div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><table align="center" border="0" cellpadding="0" cellspacing="0" class="no-collapse-outlook" style="width:600px;" width="600" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="no-collapse" style="border-collapse: initial; margin: 0px auto; border-radius: 4px; max-width: 600px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;border-radius:4px;">
+ <tbody>
+ <tr>
+ <td style="border:1px solid ${border_color};direction:ltr;font-size:0px;padding:20px 0;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="set-font-outlook" width="600px" ><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:598px;" width="598" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 598px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;padding-bottom:0;padding-top:0;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:418.6px;" ><![endif]-->
+ <div class="mj-column-per-70 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:70%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:15px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:20px;font-weight:700;line-height:1;text-align:left;color:#35414A;">${name}</div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td><td class="" style="vertical-align:top;width:179.4px;" ><![endif]-->
+ <div class="mj-column-per-30 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:30%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="right" style="font-size:0px;padding:10px 25px;padding-right:25px;word-break:break-word;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="border-collapse:collapse;border-spacing:0px;">
+ <tbody>
+ <tr>
+ <td style="width:100px;">
+ <img height="auto" src="${alarm_badge}" style="border:0;display:block;outline:none;text-decoration:none;height:auto;width:100%;font-size:13px;" width="100"/>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table></td></tr><tr><td class="set-font-outlook" width="600px" ><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:598px;" width="598" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 598px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:0;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:598px;" ><![endif]-->
+ <div class="mj-column-per-100 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:100%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:0;word-break:break-word;">
+ <div style="font-family:IBM Plex Sans, sans-serif;font-size:16px;line-height:1;text-align:left;color:#35414A;">on ${host}</div>
</td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table></td></tr><tr><td class="set-font-outlook" width="600px" ><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:598px;" width="598" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 598px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:598px;" ><![endif]-->
+ <div class="mj-column-per-100 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:100%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:26px;font-weight:700;line-height:1;text-align:left;color:#35414A;"><span style="color: ${text_color}; font-size:26px; background: ${background_color}; padding:4px 24px; border-radius: 36px">${value_string}
+ </span></div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table></td></tr><tr><td class="set-font-outlook" width="600px" ><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:598px;" width="598" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 598px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;padding-bottom:0;padding-top:0;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:598px;" ><![endif]-->
+ <div class="mj-column-per-100 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:100%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;line-height:21px;text-align:left;color:#35414A;">Details: ${info}</div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table></td></tr><tr><td class="set-font-outlook" width="600px" ><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:598px;" width="598" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 598px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;padding-bottom:0;padding-top:0;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:598px;" ><![endif]-->
+ <div class="mj-column-per-100 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:100%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="center" vertical-align="middle" style="font-size:0px;padding:10px 25px;word-break:break-word;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="border-collapse:separate;width:100%;line-height:100%;">
+ <tr>
+ <td
+ align="center" bgcolor="${border_color}" role="presentation" style="border:none;border-radius:3px;cursor:auto;height:44px;background:${border_color};" valign="middle">
+ <p style="display:block;background:${border_color};color:#ffffff;font-size:13px;font-weight:600;line-height:44px;margin:0;text-decoration:none;text-transform:none;mso-padding-alt:0px;border-radius:3px;">
+ <a href="${goto_url}" style="color: ${action_text_color}; text-decoration: none; width: 100%; display: inline-block">GO TO CHART</a>
+ </p>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ <div style="height:32px;line-height:32px;">&#8202;</div>
+ <!--[if mso | IE]><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:600px;" width="600" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; background: ${background_color}; background-color: ${background_color}; margin: 0px auto; border-radius: 4px; max-width: 600px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="background:${background_color};background-color:${background_color};width:100%;border-radius:4px;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:600px;" ><![endif]-->
+ <div class="mj-column-per-100 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:100%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-bottom:6px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:18px;line-height:1;text-align:left;color:#35414A;">Chart:
+ <span style="font-weight:700; font-size:20px">${chart}</span></div>
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:18px;line-height:1;text-align:left;color:#35414A;">Family:
+ <span style="font-weight:700; font-size:20px">${family}</span></div>
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:4px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:14px;line-height:1;text-align:left;color:#35414A;">${rich_status_raised_for}</div>
+ </td>
+ </tr>
+ <tr>
+ <td align="center" style="font-size:0px;padding:10px 25px;word-break:break-word;">
+ <p style="border-top:solid 1px lightgrey;font-size:1px;margin:0px auto;width:100%;">
+ </p>
+ <!--[if mso | IE]><table align="center" border="0" cellpadding="0" cellspacing="0" style="border-top:solid 1px lightgrey;font-size:1px;margin:0px auto;width:550px;" role="presentation" width="550px" ><tr><td style="height:0;line-height:0;"> &nbsp;
+ </td></tr></table><![endif]-->
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-bottom:6px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;line-height:1;text-align:left;color:#35414A;">On
+ <span style="font-weight:600">${date}</span></div>
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;line-height:1;text-align:left;color:#35414A;">By:
+ <span style="font-weight:600">${host}</span></div>
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:4px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:14px;line-height:1;text-align:left;color:#35414A;">Global time:
+ <span style="font-weight:600">${date_utc}</span></div>
+ </td>
+ </tr>
+ <tr>
+ <td align="center" style="font-size:0px;padding:10px 25px;word-break:break-word;">
+ <p style="border-top:solid 1px lightgrey;font-size:1px;margin:0px auto;width:100%;">
+ </p>
+ <!--[if mso | IE]><table align="center" border="0" cellpadding="0" cellspacing="0" style="border-top:solid 1px lightgrey;font-size:1px;margin:0px auto;width:550px;" role="presentation" width="550px" ><tr><td style="height:0;line-height:0;"> &nbsp;
+ </td></tr></table><![endif]-->
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-bottom:6px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;line-height:1;text-align:left;color:#35414A;">Classification:
+ <span style="font-weight:600">${classification}</span></div>
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;line-height:1;text-align:left;color:#35414A;">Role:
+ <span style="font-weight:600">${roles}</span></div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:600px;" width="600" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 600px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;padding-left:25px;text-align:left;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:66px;" ><![endif]-->
+ <div class="mj-column-px-66 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:66px;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" width="100%">
+ <tbody>
+ <tr>
+ <td style="vertical-align:top;padding:0;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-right:0;padding-left:0;word-break:break-word;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="border-collapse:collapse;border-spacing:0px;">
+ <tbody>
+ <tr>
+ <td style="width:48px;">
+ <img height="auto" src="https://app.netdata.cloud/static/email/img/community_icon.png" style="border:0;display:block;outline:none;text-decoration:none;height:auto;width:100%;font-size:13px;" width="48">
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td><td align="left" class="" style="vertical-align:top;width:400px;" ><![endif]-->
+ <div class="mj-column-px-400 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:400px;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" width="100%">
+ <tbody>
+ <tr>
+ <td style="vertical-align:top;padding-left:0;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-left:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;font-weight:700;line-height:1;text-align:left;color:#35414A;">Want to know more about this alert?</div>
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-left:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:14px;line-height:1.3;text-align:left;color:#35414A;">Discuss and troubleshoot with others on the Netdata <a href="https://community.netdata.cloud/" class="link" style="color: #00AB44; text-decoration: none;">community forums</a></div>
+ </td>
</tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:600px;" width="600" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 600px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;padding-left:25px;text-align:left;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:66px;" ><![endif]-->
+ <div class="mj-column-px-66 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:66px;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" width="100%">
+ <tbody>
+ <tr>
+ <td style="vertical-align:top;padding:0;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style width="100%">
+ <tbody>
<tr>
- <td style="vertical-align: top;" valign="top">
- <div style="margin: 0; padding: 20px; max-width: 700px;">
- <table width="100%" cellpadding="0" cellspacing="0" style="max-width:700px">
- <tbody>
- <tr>
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding:0 0 20px;" align="left" valign="top">
- <span>${chart}</span>
- <span style="display: block; color: #666666; font-size: 12px; font-weight: 300; line-height: 1; text-transform: uppercase;">Chart</span>
- </td>
- </tr>
- <tr style="margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding: 0 0 20px;" align="left" valign="top">
- <span><b>${alarm}</b>${info_html}</span>
- <span style="display: block; color: #666666; font-size: 12px; font-weight: 300; line-height: 1; text-transform: uppercase;">Alarm</span>
- </td>
- </tr>
- <tr>
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding: 0 0 20px;" align="left" valign="top">
- <span>${family}</span>
- <span style="display: block; color: #666666; font-size: 12px; font-weight: 300; line-height: 1; text-transform: uppercase;">Family</span>
- </td>
- </tr>
- <tr style="margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding: 0 0 20px;" align="left" valign="top">
- <span>${severity}</span>
- <span style="display: block; color: #666666; font-size: 12px; font-weight: 300; line-height: 1; text-transform: uppercase;">Severity</span>
- </td>
- </tr>
- <tr style="margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding: 0 0 20px;" align="left" valign="top"><span>${date}</span>
- <span>${raised_for_html}</span> <span style="display: block; color: #666666; font-size: 12px; font-weight: 300; line-height: 1; text-transform: uppercase;">Time</span>
- </td>
- </tr>
- <tr style="margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding: 0 0 20px;" align="left" valign="top">
- <span>${calc_expression}</span>
- <span style="display: block; color: #666666; font-size: 12px; font-weight: 300; line-height: 1; text-transform: uppercase;">Evaluated Expression</span>
- </td>
- </tr>
- <tr style="margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding: 0 0 20px;" align="left" valign="top">
- <span>${calc_param_values}</span>
- <span style="display: block; color: #666666; font-size: 12px; font-weight: 300; line-height: 1; text-transform: uppercase;">Expression Variables</span>
- </td>
- </tr>
- <tr style="margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding: 0 0 20px;" align="left" valign="top">
- The host has ${total_warnings} WARNING and ${total_critical} CRITICAL alarm(s) raised.
- </td>
- </tr>
-
- <tr style="margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 18px; vertical-align: top; margin: 0; padding: 0 0 20px;">
- <a href="${goto_url}" style="font-size: 14px; color: #ffffff; text-decoration: none; line-height: 1.5; font-weight: bold; text-align: center; display: inline-block; text-transform: capitalize; background: #35568d; border-width: 1px; border-style: solid; border-color: #2b4c86; margin: 0; padding: 10px 15px;" target="_blank">View Netdata</a>
- </td>
- </tr>
- <tr style="text-align: center; margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 11px; vertical-align: top; margin: 0; padding: 10px 0 0 0; color: #666666;" align="center" valign="bottom">The source of this alarm is line <code>${src}</code><br/>(alarms are configurable, edit this file to adapt the alarm to your needs)
- </td>
- </tr>
- <tr style="text-align: center; margin: 0; padding: 0;">
- <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12px; vertical-align: top; margin:0; padding: 20px 0 0 0; color: #666666; border-top: 1px solid #f0f0f0;" align="center" valign="bottom">Sent by
- <a href="https://mynetdata.io/" target="_blank">netdata</a>, the real-time performance and health monitoring, on <code>${host}</code>.
- </td>
- </tr>
- </tbody>
- </table>
- </div>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-right:0;padding-left:0;word-break:break-word;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="border-collapse:collapse;border-spacing:0px;">
+ <tbody>
+ <tr>
+ <td style="width:48px;">
+ <img height="auto" src="https://app.netdata.cloud/static/email/img/configure_icon.png" style="border:0;display:block;outline:none;text-decoration:none;height:auto;width:100%;font-size:13px;" width="48">
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td><td align="left" class="" style="vertical-align:top;width:400px;" ><![endif]-->
+ <div class="mj-column-px-400 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:400px;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" width="100%">
+ <tbody>
+ <tr>
+ <td style="vertical-align:top;padding-left:0;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-left:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;font-weight:700;line-height:1;text-align:left;color:#35414A;">Need to configure this alert?</div>
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-left:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:14px;line-height:1.3;text-align:left;color:#35414A;"><span style="color: #00AB44"><a href="https://learn.netdata.cloud/docs/agent/health/notifications#:~:text=To%20edit%20it%20on%20your,have%20one%20or%20more%20destinations" class="link" style="color: #00AB44; text-decoration: none;">Edit</a></span> this alert's configuration file by logging into $s_host and running the following command:</div>
+ </td>
+ </tr>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:8px;padding-left:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:12px;line-height:1.3;text-align:left;color:#35414A;">${edit_command} <br>
+ The alarm to edit is at line {${line}}</div>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><table align="center" border="0" cellpadding="0" cellspacing="0" class="history-wrapper-outlook" style="width:600px;" width="600" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="history-wrapper" style="background: #F7F8F8; background-color: #F7F8F8; margin: 0px auto; max-width: 100%;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="background:#F7F8F8;background-color:#F7F8F8;width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:0;padding-bottom:24px;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="set-font-outlook" width="600px" ><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:600px;" width="600" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 600px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;padding-bottom:12px;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:600px;" ><![endif]-->
+ <div class="mj-column-per-100 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:100%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style="vertical-align:top;" width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:16px;line-height:1;text-align:center;color:#35414A;">The node has
+ <span style="font-weight:600">${total_warnings} warning</span>
+ and
+ <span style="font-weight:600">${total_critical} critical</span>
+ additional active alert(s)</div>
</td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ ${CRIT_ALARMS}
+ ${WARN_ALARMS}
+ <!--[if mso | IE]></td></tr></table></td></tr></table><![endif]-->
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><table align="center" border="0" cellpadding="0" cellspacing="0" class="set-font-outlook" style="width:600px;" width="600" ><tr><td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"><![endif]-->
+ <div class="set-font" style="font-family: 'IBM Plex Sans', sans-serif; margin: 0px auto; max-width: 600px;">
+ <table align="center" border="0" cellpadding="0" cellspacing="0" role="presentation" style="width:100%;">
+ <tbody>
+ <tr>
+ <td style="direction:ltr;font-size:0px;padding:20px 0;text-align:center;">
+ <!--[if mso | IE]><table role="presentation" border="0" cellpadding="0" cellspacing="0"><tr><td class="" style="vertical-align:top;width:600px;" ><![endif]-->
+ <div class="mj-column-per-100 mj-outlook-group-fix" style="font-size:0px;text-align:left;direction:ltr;display:inline-block;vertical-align:top;width:100%;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" width="100%">
+ <tbody>
+ <tr>
+ <td style="vertical-align:top;padding-top:44px;padding-bottom:12px;">
+ <table border="0" cellpadding="0" cellspacing="0" role="presentation" style width="100%">
+ <tbody>
+ <tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;padding-top:0;padding-bottom:0;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:13px;line-height:1;text-align:center;color:#35414A;">© Netdata 2021 - The real-time performance and health monitoring</div>
+ </td>
</tr>
</tbody>
- </table>
- </div>
+ </table>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
</td>
- </tr>
- </tbody>
-</table>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <!--[if mso | IE]></td></tr></table><![endif]-->
+</div>
</body>
</html>
EOF
send_email <<EOF
To: ${to_email}
-Subject: ${host} ${status_message} - ${name//_/ } - ${chart}
+Subject: ${html_email_subject}
MIME-Version: 1.0
Content-Type: multipart/alternative; boundary="multipart-boundary"
${email_thread_headers}
+X-Netdata-Severity: ${status,,}
+X-Netdata-Alert-Name: $name
+X-Netdata-Chart: $chart
+X-Netdata-Family: $family
+X-Netdata-Classification: $classification
+X-Netdata-Host: $host
+X-Netdata-Role: $roles
This is a MIME-encoded multipart message
diff --git a/health/notifications/custom/README.md b/health/notifications/custom/README.md
index 04376d555..bcb09ef53 100644
--- a/health/notifications/custom/README.md
+++ b/health/notifications/custom/README.md
@@ -7,7 +7,11 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/health/notificat
Netdata allows you to send custom notifications to any endpoint you choose.
-To configure custom notifications, you will need to customize `health_alarm_notify.conf`. You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples of how to modify the `custom_sender()` function in `health_alarm_notify.conf`. Ensure you follow the instructions of changing any configuration file to [persist your configuration](/docs/configuration-guide.md#persist-my-configuration).
+To configure custom notifications, you will need to customize `health_alarm_notify.conf`. Open the file for editing
+using [`edit-config`](/docs/configure/nodes.md#use-edit-config-to-edit-configuration-files) from the [Netdata config
+directory](/docs/configure/nodes.md#the-netdata-config-directory), which is typically at `/etc/netdata`.
+
+You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples of how to modify the `custom_sender()` function in `health_alarm_notify.conf`.
As with other notifications, you will also need to define the recipient list in `DEFAULT_RECIPIENT_CUSTOM` and/or the `role_recipients_custom` array.
diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md
index ebd7f4b8c..82786fca1 100644
--- a/health/notifications/email/README.md
+++ b/health/notifications/email/README.md
@@ -11,9 +11,9 @@ Netdata sends all emails as user `netdata`, so make sure your `sendmail` works f
email notifications look like this:
-![image](https://cloud.githubusercontent.com/assets/2662304/18407294/e9218c68-7714-11e6-8739-e4dd8a498252.png)
+![image](https://user-images.githubusercontent.com/1905463/133216974-a2ca0e4f-787b-4dce-b1b2-9996a8c5f718.png)
-## configuration
+## Configuration
To edit `health_alarm_notify.conf` on your system run `/etc/netdata/edit-config health_alarm_notify.conf`.
@@ -38,6 +38,20 @@ Where `[ROLE]` is the role you want to test. The default (if you don't give a `[
Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`).
You can always find the location of the alarm-notify.sh script in `netdata.conf`.
+## Filtering
+
+Every notification email from the Netdata agent (both the plain text and the rich HTML versions) contains a set of custom email headers that can be used for filtering in an email client. Example:
+
+```
+X-Netdata-Severity: warning
+X-Netdata-Alert-Name: inbound_packets_dropped_ratio
+X-Netdata-Chart: net_packets.enp2s0
+X-Netdata-Family: enp2s0
+X-Netdata-Classification: System
+X-Netdata-Host: winterland
+X-Netdata-Role: sysadmin
+```
+
## Simple SMTP transport configuration
If you want an alternative to `sendmail` in order to have a simple MTA configuration for sending emails and auth to an existing SMTP server, you can do the following:
diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf
index e851a530c..873c7c353 100755
--- a/health/notifications/health_alarm_notify.conf
+++ b/health/notifications/health_alarm_notify.conf
@@ -253,13 +253,13 @@ DYNATRACE_TOKEN=""
DYNATRACE_SPACE=""
# Generate a Server Tag. On the Dynatrace Server go to Settings --> Tags --> Manually applied tags create the Tag
-# The NetData alarm will be sent as a Dynatrace Event to be correlated with all those hosts tagged with this Tag
+# The Netdata alarm will be sent as a Dynatrace Event to be correlated with all those hosts tagged with this Tag
# you created.
# Required
DYNATRACE_TAG_VALUE=""
# Change this to what you want
-DYNATRACE_ANNOTATION_TYPE="NetData Alarm"
+DYNATRACE_ANNOTATION_TYPE="Netdata Alarm"
# This can be CUSTOM_INFO, CUSTOM_ANNOTATION, CUSTOM_CONFIGURATION, CUSTOM_DEPLOYMENT
# Applying default value
diff --git a/health/notifications/syslog/README.md b/health/notifications/syslog/README.md
index 456394d2f..360f6844d 100644
--- a/health/notifications/syslog/README.md
+++ b/health/notifications/syslog/README.md
@@ -17,7 +17,7 @@ netdata WARNING on hostname at Tue Apr 3 09:00:00 EDT 2018: disk_space._ out of
System log targets are configured as recipients in [`/etc/netdata/health_alarm_notify.conf`](https://github.com/netdata/netdata/blob/36bedc044584dea791fd29455bdcd287c3306cb2/conf.d/health_alarm_notify.conf#L534) (to edit it on your system run `/etc/netdata/edit-config health_alarm_notify.conf`).
-You can als configure per-role targets in the same file a bit further down.
+You can also configure per-role targets in the same file a bit further down.
Targets are defined as follows: