From e970e0b37b8bd7f246feb3f70c4136418225e434 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 1 Dec 2021 07:15:04 +0100 Subject: Adding upstream version 1.32.0. Signed-off-by: Daniel Baumann --- health/Makefile.am | 19 +- health/REFERENCE.md | 60 +- health/health.c | 73 ++- health/health.d/adaptec_raid.conf | 8 +- health/health.d/am2320.conf | 15 - health/health.d/anomalies.conf | 8 +- health/health.d/apache.conf | 17 - health/health.d/apcupsd.conf | 12 +- health/health.d/backend.conf | 12 +- health/health.d/bcache.conf | 8 +- health/health.d/beanstalkd.conf | 4 +- health/health.d/bind_rndc.conf | 4 +- health/health.d/boinc.conf | 16 +- health/health.d/btrfs.conf | 16 +- health/health.d/ceph.conf | 4 +- health/health.d/cgroups.conf | 8 +- health/health.d/cockroachdb.conf | 72 +-- health/health.d/couchdb.conf | 16 - health/health.d/cpu.conf | 16 +- health/health.d/dbengine.conf | 16 +- health/health.d/disks.conf | 26 +- health/health.d/dns_query.conf | 4 +- health/health.d/dnsmasq_dhcp.conf | 4 +- health/health.d/dockerd.conf | 4 +- health/health.d/elasticsearch.conf | 15 - health/health.d/entropy.conf | 4 +- health/health.d/exporting.conf | 29 +- health/health.d/fping.conf | 16 +- health/health.d/fronius.conf | 4 +- health/health.d/gearman.conf | 20 +- health/health.d/geth.conf | 12 + health/health.d/go.d.plugin.conf | 17 + health/health.d/haproxy.conf | 21 +- health/health.d/hdfs.conf | 37 +- health/health.d/httpcheck.conf | 46 +- health/health.d/ioping.conf | 4 +- health/health.d/ipc.conf | 8 +- health/health.d/ipfs.conf | 4 +- health/health.d/ipmi.conf | 8 +- health/health.d/kubelet.conf | 36 +- health/health.d/lighttpd.conf | 17 - health/health.d/linux_power_supply.conf | 4 +- health/health.d/load.conf | 16 +- health/health.d/mdstat.conf | 16 +- health/health.d/megacli.conf | 20 +- health/health.d/memcached.conf | 29 +- health/health.d/memory.conf | 12 +- health/health.d/mongodb.conf | 16 - health/health.d/mysql.conf | 62 +- health/health.d/named.conf 
| 17 - health/health.d/net.conf | 60 +- health/health.d/netfilter.conf | 4 +- health/health.d/nginx.conf | 17 - health/health.d/nginx_plus.conf | 17 - health/health.d/phpfpm.conf | 17 - health/health.d/pihole.conf | 49 +- health/health.d/portcheck.conf | 26 +- health/health.d/postgres.conf | 16 - health/health.d/processes.conf | 4 +- health/health.d/pulsar.conf | 16 - health/health.d/python.d.plugin.conf | 17 + health/health.d/ram.conf | 48 +- health/health.d/redis.conf | 24 +- health/health.d/retroshare.conf | 19 +- health/health.d/riakkv.conf | 38 +- health/health.d/scaleio.conf | 24 +- health/health.d/softnet.conf | 12 +- health/health.d/squid.conf | 17 - health/health.d/stiebeleltron.conf | 4 +- health/health.d/swap.conf | 10 +- health/health.d/systemdunits.conf | 40 +- health/health.d/tcp_conn.conf | 4 +- health/health.d/tcp_listen.conf | 16 +- health/health.d/tcp_mem.conf | 4 +- health/health.d/tcp_orphans.conf | 4 +- health/health.d/tcp_resets.conf | 16 +- health/health.d/timex.conf | 17 + health/health.d/udp_errors.conf | 8 +- health/health.d/unbound.conf | 24 +- health/health.d/varnish.conf | 12 - health/health.d/vcsa.conf | 48 +- health/health.d/vernemq.conf | 120 ++-- health/health.d/vsphere.conf | 44 +- health/health.d/web_log.conf | 135 ++-- health/health.d/whoisquery.conf | 21 +- health/health.d/wmi.conf | 50 +- health/health.d/x509check.conf | 25 +- health/health.d/zfs.conf | 12 +- health/health.d/zookeeper.conf | 17 - health/health.h | 6 +- health/health_config.c | 153 ++++- health/health_json.c | 18 + health/health_log.c | 64 +- health/notifications/alarm-notify.sh.in | 888 +++++++++++++++++++++++--- health/notifications/custom/README.md | 6 +- health/notifications/email/README.md | 18 +- health/notifications/health_alarm_notify.conf | 4 +- health/notifications/syslog/README.md | 2 +- 98 files changed, 1752 insertions(+), 1395 deletions(-) delete mode 100644 health/health.d/am2320.conf delete mode 100644 health/health.d/apache.conf delete mode 
100644 health/health.d/couchdb.conf delete mode 100644 health/health.d/elasticsearch.conf create mode 100644 health/health.d/geth.conf create mode 100644 health/health.d/go.d.plugin.conf delete mode 100644 health/health.d/lighttpd.conf delete mode 100644 health/health.d/mongodb.conf delete mode 100644 health/health.d/named.conf delete mode 100644 health/health.d/nginx.conf delete mode 100644 health/health.d/nginx_plus.conf delete mode 100644 health/health.d/phpfpm.conf delete mode 100644 health/health.d/postgres.conf delete mode 100644 health/health.d/pulsar.conf create mode 100644 health/health.d/python.d.plugin.conf delete mode 100644 health/health.d/squid.conf create mode 100644 health/health.d/timex.conf delete mode 100644 health/health.d/varnish.conf delete mode 100644 health/health.d/zookeeper.conf (limited to 'health') diff --git a/health/Makefile.am b/health/Makefile.am index b963ea0cd..349b86d61 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -25,9 +25,7 @@ install-exec-local: healthconfigdir=$(libconfigdir)/health.d dist_healthconfig_DATA = \ health.d/adaptec_raid.conf \ - health.d/am2320.conf \ health.d/anomalies.conf \ - health.d/apache.conf \ health.d/apcupsd.conf \ health.d/backend.conf \ health.d/bcache.conf \ @@ -39,18 +37,18 @@ dist_healthconfig_DATA = \ health.d/cgroups.conf \ health.d/cpu.conf \ health.d/cockroachdb.conf \ - health.d/couchdb.conf \ health.d/disks.conf \ health.d/dnsmasq_dhcp.conf \ health.d/dns_query.conf \ health.d/dockerd.conf \ - health.d/elasticsearch.conf \ health.d/entropy.conf \ health.d/exporting.conf \ health.d/fping.conf \ + health.d/geth.conf \ health.d/ioping.conf \ health.d/fronius.conf \ health.d/gearman.conf \ + health.d/go.d.plugin.conf \ health.d/haproxy.conf \ health.d/hdfs.conf \ health.d/httpcheck.conf \ @@ -59,26 +57,19 @@ dist_healthconfig_DATA = \ health.d/ipmi.conf \ health.d/isc_dhcpd.conf \ health.d/kubelet.conf \ - health.d/lighttpd.conf \ health.d/linux_power_supply.conf \ 
health.d/load.conf \ health.d/mdstat.conf \ health.d/megacli.conf \ health.d/memcached.conf \ health.d/memory.conf \ - health.d/mongodb.conf \ health.d/mysql.conf \ - health.d/named.conf \ health.d/net.conf \ health.d/netfilter.conf \ - health.d/nginx.conf \ - health.d/nginx_plus.conf \ health.d/pihole.conf \ - health.d/phpfpm.conf \ health.d/portcheck.conf \ - health.d/postgres.conf \ health.d/processes.conf \ - health.d/pulsar.conf \ + health.d/python.d.plugin.conf \ health.d/qos.conf \ health.d/ram.conf \ health.d/redis.conf \ @@ -86,11 +77,11 @@ dist_healthconfig_DATA = \ health.d/riakkv.conf \ health.d/scaleio.conf \ health.d/softnet.conf \ - health.d/squid.conf \ health.d/stiebeleltron.conf \ health.d/synchronization.conf \ health.d/swap.conf \ health.d/systemdunits.conf \ + health.d/timex.conf \ health.d/tcp_conn.conf \ health.d/tcp_listen.conf \ health.d/tcp_mem.conf \ @@ -98,7 +89,6 @@ dist_healthconfig_DATA = \ health.d/tcp_resets.conf \ health.d/udp_errors.conf \ health.d/unbound.conf \ - health.d/varnish.conf \ health.d/vcsa.conf \ health.d/vernemq.conf \ health.d/vsphere.conf \ @@ -107,6 +97,5 @@ dist_healthconfig_DATA = \ health.d/wmi.conf \ health.d/x509check.conf \ health.d/zfs.conf \ - health.d/zookeeper.conf \ health.d/dbengine.conf \ $(NULL) diff --git a/health/REFERENCE.md b/health/REFERENCE.md index 5ea6b7c5d..f1bb5557d 100644 --- a/health/REFERENCE.md +++ b/health/REFERENCE.md @@ -54,14 +54,17 @@ Netdata parses the following lines. Beneath the table is an in-depth explanation - A few lines use space-separated lists to define how the entity behaves. You can use `*` as a wildcard or prefix with `!` for a negative match. Order is important, too! See our [simple patterns docs](../libnetdata/simple_pattern/) for more examples. +- Lines terminated by a `\` are spliced together with the next line. The backslash is removed and the following line is + joined with the current one. 
No space is inserted, so you may split a line anywhere, even in the middle of a word. + This comes in handy if your `info` line consists of several sentences. | line | required | functionality | | --------------------------------------------------- | --------------- | ------------------------------------------------------------------------------------- | | [`alarm`/`template`](#alarm-line-alarm-or-template) | yes | Name of the alarm/template. | | [`on`](#alarm-line-on) | yes | The chart this alarm should attach to. | -| [`class`](#alarm-line-class) | no | The general classification of the alarm. | -| [`component`](#alarm-line-component) | no | Specify the component of the class of the alarm. | -| [`type`](#alarm-line-type) | no | The type of error the alarm monitors. | +| [`class`](#alarm-line-class) | no | The general alarm classification. | +| [`type`](#alarm-line-type) | no | What area of the system the alarm monitors. | +| [`component`](#alarm-line-component) | no | Specific component of the type of the alarm. | | [`os`](#alarm-line-os) | no | Which operating systems to run this chart. | | [`hosts`](#alarm-line-hosts) | no | Which hostnames will run this alarm. | | [`plugin`](#alarm-line-plugin) | no | Restrict an alarm or template to only a certain plugin. | @@ -136,24 +139,45 @@ If you create a template using the `disk.io` context, it will apply an alarm to #### Alarm line `class` -Specify the classification of the alarm or template. +This indicates the type of error (or general problem area) that the alarm or template applies to. For example, `Latency` can be used for alarms that trigger on latency issues on network interfaces, web servers, or database systems. Example: -Class can be used to indicate the broader area of the system that the alarm applies to. For example, under the general `Database` class, you can group together alarms that operate on various database systems, like `MySQL`, `CockroachDB`, `CouchDB` etc. 
Example: +```yaml +class: Latency +``` + +
+Netdata's stock alarms use the following `class` attributes by default: + +| Class | +| ----------------| +| Errors | +| Latency | +| Utilization | +| Workload | + + +
+ +`class` will default to `Unknown` if the line is missing from the alarm configuration. + +#### Alarm line `type` + +Type can be used to indicate the broader area of the system that the alarm applies to. For example, under the general `Database` type, you can group together alarms that operate on various database systems, like `MySQL`, `CockroachDB`, `CouchDB` etc. Example: ```yaml -class: Database +type: Database ```
-Netdata's stock alarms use the following `class` attributes by default, but feel free to adjust for your own requirements. +Netdata's stock alarms use the following `type` attributes by default, but feel free to adjust for your own requirements. -| Class | Description | +| Type | Description | | ------------------------ | ------------------------------------------------------------------------------------------------ | | Ad Filtering | Services related to Ad Filtering (like pi-hole) | | Certificates | Certificates monitoring related | | Cgroups | Alerts for cpu and memory usage of control groups | | Computing | Alerts for shared computing applications (e.g. boinc) | | Containers | Container related alerts (e.g. docker instances) | -| Database | Database systems (e.g. MySQL, Postgress, etc) | +| Database | Database systems (e.g. MySQL, PostgreSQL, etc) | | Data Sharing | Used to group together alerts for data sharing applications | | DHCP | Alerts for dhcp related services | | DNS | Alerts for dns related services | @@ -162,7 +186,7 @@ class: Database | Linux | Services specific to Linux (e.g. systemd) | | Messaging | Alerts for message passing services (e.g. vernemq) | | Netdata | Internal Netdata components monitoring | -| Other | Use as a general class of alerts | +| Other | When an alert doesn't fit in other types. | | Power Supply | Alerts from power supply related services (e.g. apcupsd) | | Search engine | Alerts for search services (e.g. elasticsearch) | | Storage | Class for alerts dealing with storage services (storage devices typically live under `System`) | @@ -174,26 +198,16 @@ class: Database
-If an alarm configuration is missing the `class` line, its value will default to `Unknown`. +If an alarm configuration is missing the `type` line, its value will default to `Unknown`. #### Alarm line `component` -Component can be used to narrow down what the previous `class` value specifies for each alarm or template. Continuing from the previous example, `component` might include `MySQL`, `CockroachDB`, `MongoDB`, all under the same `Database` classification. Example: +Component can be used to narrow down what the previous `type` value specifies for each alarm or template. Continuing from the previous example, `component` might include `MySQL`, `CockroachDB`, `MongoDB`, all under the same `Database` type. Example: ```yaml component: MySQL ``` -As with the `class` line, if `component` is missing from the configuration, its value will default to `Unknown`. - -#### Alarm line `type` - -This indicates the type of error (or general problem area) that the alarm or template applies to. For example, `Latency` can be used for alarms that trigger on latency issues in network interfaces, web servers, or database systems. Example: - -```yaml -type: Latency -``` - -`type` will also (as with `class` and `component`) default to `Unknown` if the line is missing from the alarm configuration. +As with the `class` and `type` lines, if `component` is missing from the configuration, its value will default to `Unknown`. 
#### Alarm line `os` diff --git a/health/health.c b/health/health.c index 85d2a2458..d8e1d4b77 100644 --- a/health/health.c +++ b/health/health.c @@ -230,6 +230,9 @@ void health_reload(void) { if (netdata_cloud_setting) { aclk_single_update_enable(); aclk_alarm_reload(); +#ifdef ENABLE_NEW_CLOUD_PROTOCOL + aclk_alert_reloaded = 1; +#endif } #endif } @@ -308,26 +311,44 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { int n_warn=0, n_crit=0; RRDCALC *rc; EVAL_EXPRESSION *expr=NULL; + BUFFER *warn_alarms, *crit_alarms; + + warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); + crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); for(rc = host->alarms; rc ; rc = rc->next) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; - if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { - n_warn++; - if (ae->alarm_id == rc->id) - expr=rc->warning; + if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) { + if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { + if (n_warn) + buffer_strcat(warn_alarms, ","); + buffer_strcat(warn_alarms, rc->name); + buffer_strcat(warn_alarms, "="); + buffer_snprintf(warn_alarms, 11, "%ld", rc->last_status_change); + n_warn++; + } else if (ae->alarm_id == rc->id) + expr = rc->warning; } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { - n_crit++; - if (ae->alarm_id == rc->id) - expr=rc->critical; + if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { + if (n_crit) + buffer_strcat(crit_alarms, ","); + buffer_strcat(crit_alarms, rc->name); + buffer_strcat(crit_alarms, "="); + buffer_snprintf(crit_alarms, 11, "%ld", rc->last_status_change); + n_crit++; + } else if (ae->alarm_id == rc->id) + expr = rc->critical; } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { if (ae->alarm_id == rc->id) - expr=rc->warning; + expr = rc->warning; } } - snprintfz(command_to_run, 
ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d'", + char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0"); + + snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", exec, recipient, host->registry_hostname, @@ -352,7 +373,12 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { (expr && expr->source)?expr->source:"NOSOURCE", (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG", n_warn, - n_crit + n_crit, + buffer_tostring(warn_alarms), + buffer_tostring(crit_alarms), + ae->classification?ae->classification:"Unknown", + edit_command, + localhost->registry_hostname ); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; @@ -363,6 +389,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); enqueue_alarm_notify_in_progress(ae); + freez(edit_command); + buffer_free(warn_alarms); + buffer_free(crit_alarms); + return; //health_alarm_wait_for_execution done: health_alarm_log_save(host, ae); @@ -635,6 +665,8 @@ void *health_main(void *ptr) { int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); if(min_run_every < 1) min_run_every = 1; + int cleanup_sql_every_loop = 7200 / min_run_every; + time_t now = now_realtime_sec(); time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); @@ -689,6 +721,9 @@ void *health_main(void *ptr) { host->health_delay_up_to = 0; } + if(likely(!host->health_log_fp) && (loop == 1 || loop % 
cleanup_sql_every_loop == 0)) + sql_health_alarm_log_cleanup(host); + rrdhost_rdlock(host); // the first loop is to lookup values from the db @@ -929,7 +964,7 @@ void *health_main(void *ptr) { if(likely(!rrdcalc_isrepeating(rc))) { ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last, @@ -979,7 +1014,7 @@ void *health_main(void *ptr) { if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { rc->last_repeat = now; ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, rc->delay_last, @@ -1003,6 +1038,14 @@ void *health_main(void *ptr) { rrdhost_unlock(host); } +#ifdef ENABLE_ACLK +#ifdef ENABLE_NEW_CLOUD_PROTOCOL + if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > 2) { + sql_queue_removed_alerts_to_aclk(host); + } +#endif +#endif + if (unlikely(netdata_exit)) break; @@ -1027,8 +1070,12 @@ void *health_main(void *ptr) { health_alarm_wait_for_execution(ae); } - rrd_unlock(); +#ifdef ENABLE_NEW_CLOUD_PROTOCOL + if (netdata_cloud_setting && unlikely(aclk_alert_reloaded)) + aclk_alert_reloaded = 0; +#endif + rrd_unlock(); if(unlikely(netdata_exit)) break; diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf index b067e1840..1d823addd 100644 --- a/health/health.d/adaptec_raid.conf +++ 
b/health/health.d/adaptec_raid.conf @@ -3,9 +3,9 @@ template: adaptec_raid_ld_status on: adaptec_raid.ld_status - class: System + class: Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: bool every: 10s @@ -18,9 +18,9 @@ component: RAID template: adaptec_raid_pd_state on: adaptec_raid.pd_state - class: System + class: Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: bool every: 10s diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf deleted file mode 100644 index 4bac98fbb..000000000 --- a/health/health.d/am2320.conf +++ /dev/null @@ -1,15 +0,0 @@ -# make sure am2320 is sending stats - - template: am2320_last_collected_secs - on: am2320.temperature - class: Other -component: Sensors - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf index f27e39fc1..269ae544b 100644 --- a/health/health.d/anomalies.conf +++ b/health/health.d/anomalies.conf @@ -2,9 +2,9 @@ template: anomalies_anomaly_probabilities on: anomalies.probability - class: Netdata + class: Errors + type: Netdata component: ML - type: Errors lookup: average -2m foreach * every: 1m warn: $this > 50 @@ -14,9 +14,9 @@ component: ML template: anomalies_anomaly_flags on: anomalies.anomaly - class: Netdata + class: Errors + type: Netdata component: ML - type: Errors lookup: sum -2m foreach * every: 1m warn: $this > 10 diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf deleted file mode 100644 index c623fb880..000000000 --- a/health/health.d/apache.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure apache is running 
- - template: apache_last_collected_secs - on: apache.requests - class: Web Server -component: Apache - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 07b5c28c9..65f1a69ab 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -2,9 +2,9 @@ template: apcupsd_10min_ups_load on: apcupsd.load - class: Power Supply + class: Utilization + type: Power Supply component: UPS - type: Utilization os: * hosts: * lookup: average -10m unaligned of percentage @@ -20,9 +20,9 @@ component: UPS # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. template: apcupsd_ups_charge on: apcupsd.charge - class: Power Supply + class: Errors + type: Power Supply component: UPS - type: Errors os: * hosts: * lookup: average -60s unaligned of charge @@ -36,9 +36,9 @@ component: UPS template: apcupsd_last_collected_secs on: apcupsd.load - class: Power Supply + class: Latency + type: Power Supply component: UPS device - type: Latency calc: $now - $last_collected_t every: 10s units: seconds ago diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf index 948ea551a..91d469395 100644 --- a/health/health.d/backend.conf +++ b/health/health.d/backend.conf @@ -1,9 +1,9 @@ # Alert that backends subsystem will be disabled soon alarm: backend_metrics_eol on: netdata.backend_metrics - class: Netdata + class: Errors + type: Netdata component: Exporting engine - type: Errors units: boolean calc: $now - $last_collected_t every: 1m @@ -16,9 +16,9 @@ component: Exporting engine alarm: backend_last_buffering on: netdata.backend_metrics - class: 
Netdata + class: Latency + type: Netdata component: Exporting engine - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -30,9 +30,9 @@ component: Exporting engine alarm: backend_metrics_sent on: netdata.backend_metrics - class: Netdata + class: Workload + type: Netdata component: Exporting engine - type: Workload units: % calc: abs($sent) * 100 / abs($buffered) every: 10s diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index d75d8e19b..49cb5ad0f 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -1,9 +1,9 @@ template: bcache_cache_errors on: disk.bcache_cache_read_races - class: System + class: Errors + type: System component: Disk - type: Errors lookup: sum -1m unaligned absolute units: errors every: 1m @@ -16,9 +16,9 @@ component: Disk template: bcache_cache_dirty on: disk.bcache_cache_alloc - class: System + class: Utilization + type: System component: Disk - type: Utilization calc: $dirty + $metadata + $undefined units: % every: 1m diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 99c754571..13ac8c182 100644 --- a/health/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf @@ -2,9 +2,9 @@ template: beanstalk_server_buried_jobs on: beanstalk.current_jobs - class: Messaging + class: Workload + type: Messaging component: Beanstalk - type: Workload calc: $buried units: jobs every: 10s diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf index e88f87a4f..7c09225ff 100644 --- a/health/health.d/bind_rndc.conf +++ b/health/health.d/bind_rndc.conf @@ -1,8 +1,8 @@ template: bind_rndc_stats_file_size on: bind_rndc.stats_size - class: DNS + class: Utilization + type: DNS component: BIND - type: Utilization units: megabytes every: 60 calc: $stats_size diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf index 8604abee9..7d7a4fdae 100644 --- a/health/health.d/boinc.conf +++ b/health/health.d/boinc.conf @@ -3,9 
+3,9 @@ # Warn on any compute errors encountered. template: boinc_compute_errors on: boinc.states - class: Computing + class: Errors + type: Computing component: BOINC - type: Errors os: * hosts: * families: * @@ -21,9 +21,9 @@ component: BOINC # Warn on lots of upload errors template: boinc_upload_errors on: boinc.states - class: Computing + class: Errors + type: Computing component: BOINC - type: Errors os: * hosts: * families: * @@ -39,9 +39,9 @@ component: BOINC # Warn on the task queue being empty template: boinc_total_tasks on: boinc.tasks - class: Computing + class: Utilization + type: Computing component: BOINC - type: Utilization os: * hosts: * families: * @@ -57,9 +57,9 @@ component: BOINC # Warn on no active tasks with a non-empty queue template: boinc_active_tasks on: boinc.tasks - class: Computing + class: Utilization + type: Computing component: BOINC - type: Utilization os: * hosts: * families: * diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf index d3200a7ee..8d197aa8d 100644 --- a/health/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf @@ -1,9 +1,9 @@ template: btrfs_allocated on: btrfs.disk - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -18,9 +18,9 @@ component: File system template: btrfs_data on: btrfs.data - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -35,9 +35,9 @@ component: File system template: btrfs_metadata on: btrfs.metadata - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -52,9 +52,9 @@ component: File system template: btrfs_system on: btrfs.system - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf index ed8f9b4b9..1f9da25c7 100644 --- 
a/health/health.d/ceph.conf +++ b/health/health.d/ceph.conf @@ -2,9 +2,9 @@ template: ceph_cluster_space_usage on: ceph.general_usage - class: Storage + class: Utilization + type: Storage component: Ceph - type: Utilization calc: $used * 100 / ($used + $avail) units: % every: 1m diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 068533f10..45b34806c 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -3,9 +3,9 @@ template: cgroup_10min_cpu_usage on: cgroup.cpu_limit - class: Cgroups + class: Utilization + type: Cgroups component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned @@ -19,9 +19,9 @@ component: CPU template: cgroup_ram_in_use on: cgroup.mem_usage - class: Cgroups + class: Utilization + type: Cgroups component: Memory - type: Utilization os: linux hosts: * calc: ($ram) * 100 / $memory_limit diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf index dccd2b064..1f227841e 100644 --- a/health/health.d/cockroachdb.conf +++ b/health/health.d/cockroachdb.conf @@ -1,27 +1,11 @@ -# Availability - - template: cockroachdb_last_collected_secs - on: cockroachdb.live_nodes - class: Database -component: CockroachDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - # Capacity template: cockroachdb_used_storage_capacity on: cockroachdb.storage_used_capacity_percentage - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: $capacity_used_percent units: % every: 10s @@ -33,9 +17,9 @@ component: CockroachDB template: cockroachdb_used_usable_storage_capacity on: cockroachdb.storage_used_capacity_percentage - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: $capacity_usable_used_percent units: % every: 10s @@ -49,37 +33,37 @@ component: CockroachDB template: cockroachdb_unavailable_ranges on: cockroachdb.ranges_replication_problem - class: Database + class: Errors + type: Database component: CockroachDB - type: Utilization calc: $ranges_unavailable units: num every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: number of ranges with fewer live replicas than the replication target + info: number of ranges with fewer live replicas than needed for quorum to: dba - template: cockroachdb_replicas_leaders_not_leaseholders - on: cockroachdb.replicas_leaders - class: Database + template: cockroachdb_underreplicated_ranges + on: cockroachdb.ranges_replication_problem + class: Errors + type: Database component: CockroachDB - type: Utilization - calc: $replicas_leaders_not_leaseholders + calc: $ranges_underreplicated units: num every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: number of replicas that are Raft leaders whose range lease is held by another store + info: number of ranges with fewer live replicas than the replication target to: dba # FD template: cockroachdb_open_file_descriptors_limit on: cockroachdb.process_file_descriptors - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: 
$sys_fd_open/$sys_fd_softlimit * 100 units: % every: 10s @@ -87,29 +71,3 @@ component: CockroachDB delay: down 15m multiplier 1.5 max 1h info: open file descriptors utilization (against softlimit) to: dba - -# SQL - - template: cockroachdb_sql_active_connections - on: cockroachdb.sql_connections - class: Database -component: CockroachDB - type: Utilization - calc: $sql_conns - units: active connections - every: 10s - info: number of active SQL connections - to: dba - - template: cockroachdb_sql_executed_statements_total_last_5m - on: cockroachdb.sql_statements_total - class: Database -component: CockroachDB - type: Workload - lookup: sum -5m absolute of sql_query_count - units: statements - every: 10s - warn: $this == 0 AND $cockroachdb_sql_active_connections != 0 - delay: down 15m up 30s multiplier 1.5 max 1h - info: number of executed SQL statements in the last 5 minutes - to: dba diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf deleted file mode 100644 index c86c6b988..000000000 --- a/health/health.d/couchdb.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure couchdb is running - - template: couchdb_last_collected_secs - on: couchdb.request_methods - class: Database -component: CouchDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf index d11215768..ad6952825 100644 --- a/health/health.d/cpu.conf +++ b/health/health.d/cpu.conf @@ -3,9 +3,9 @@ template: 10min_cpu_usage on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned of user,system,softirq,irq,guest @@ -19,9 +19,9 @@ component: CPU template: 10min_cpu_iowait on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned of iowait @@ -35,9 +35,9 @@ component: CPU template: 20min_steal_cpu on: system.cpu - class: System + class: Latency + type: System component: CPU - type: Latency os: linux hosts: * lookup: average -20m unaligned of steal @@ -52,9 +52,9 @@ component: CPU ## FreeBSD template: 10min_cpu_usage on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: freebsd hosts: * lookup: average -10m unaligned of user,system,interrupt diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index 79c156ab8..65c41b846 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -3,9 +3,9 @@ alarm: 10min_dbengine_global_fs_errors on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of fs_errors @@ -18,9 +18,9 @@ component: DB engine alarm: 10min_dbengine_global_io_errors on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of io_errors @@ -33,9 +33,9 @@ component: DB engine alarm: 
10min_dbengine_global_flushing_warnings on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of pg_cache_over_half_dirty_events @@ -49,9 +49,9 @@ component: DB engine alarm: 10min_dbengine_global_flushing_errors on: netdata.dbengine_long_term_page_stats - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of flushing_pressure_deletions diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 60f8faed9..5daff61a1 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -11,9 +11,9 @@ template: disk_space_usage on: disk.space - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: !/dev !/dev/* !/run !/run/* * @@ -28,9 +28,9 @@ component: Disk template: disk_inode_usage on: disk.inodes - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: !/dev !/dev/* !/run !/run/* * @@ -136,19 +136,16 @@ component: Disk template: 10min_disk_utilization on: disk.util - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: * lookup: average -10m unaligned units: % every: 1m - green: 90 - red: 98 - warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) - crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + warn: $this > 98 * (($status >= $WARNING) ? 
(0.7) : (1)) delay: down 15m multiplier 1.2 max 1h info: average percentage of time $family disk was busy over the last 10 minutes to: silent @@ -161,19 +158,16 @@ component: Disk template: 10min_disk_backlog on: disk.backlog - class: System + class: Latency + type: System component: Disk - type: Latency os: linux hosts: * families: * lookup: average -10m unaligned units: ms every: 1m - green: 2000 - red: 5000 - warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) - crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h info: average backlog size of the $family disk over the last 10 minutes to: silent diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf index 1fbb2c598..ec4937c0a 100644 --- a/health/health.d/dns_query.conf +++ b/health/health.d/dns_query.conf @@ -3,9 +3,9 @@ template: dns_query_time_query_time on: dns_query_time.query_time - class: DNS + class: Latency + type: DNS component: DNS - type: Latency lookup: average -10s unaligned foreach * units: ms every: 10s diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index 10d139f77..010b94599 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -2,9 +2,9 @@ template: dnsmasq_dhcp_dhcp_range_utilization on: dnsmasq_dhcp.dhcp_range_utilization - class: DHCP + class: Utilization + type: DHCP component: Dnsmasq - type: Utilization every: 10s units: % calc: $used diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf index ba866f81b..220ddd664 100644 --- a/health/health.d/dockerd.conf +++ b/health/health.d/dockerd.conf @@ -1,8 +1,8 @@ template: docker_unhealthy_containers on: docker.unhealthy_containers - class: Containers + class: Errors + type: Containers component: Docker - type: Errors units: unhealthy containers every: 10s lookup: average -10s diff --git a/health/health.d/elasticsearch.conf 
b/health/health.d/elasticsearch.conf deleted file mode 100644 index 05d576c39..000000000 --- a/health/health.d/elasticsearch.conf +++ /dev/null @@ -1,15 +0,0 @@ - -# make sure elasticsearch is running - - template: elasticsearch_last_collected - on: elasticsearch.cluster_health_status - class: Search engine -component: Elasticsearch - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf index 0478fa0be..13b0fcde4 100644 --- a/health/health.d/entropy.conf +++ b/health/health.d/entropy.conf @@ -5,9 +5,9 @@ alarm: lowest_entropy on: system.entropy - class: System + class: Utilization + type: System component: Cryptography - type: Utilization os: linux hosts: * lookup: min -5m unaligned diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf index 4430f3fd8..06f398c6e 100644 --- a/health/health.d/exporting.conf +++ b/health/health.d/exporting.conf @@ -1,22 +1,25 @@ -template: exporting_last_buffering -families: * - on: exporting_data_size - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful buffering of exporting data - to: dba + template: exporting_last_buffering + families: * + on: exporting_data_size + class: Latency + type: Netdata +component: Exporting engine + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? 
($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful buffering of exporting data + to: dba template: exporting_metrics_sent families: * on: exporting_data_size - class: Netdata + class: Workload + type: Netdata component: Exporting engine - type: Workload units: % calc: abs($sent) * 100 / abs($buffered) every: 10s diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf index 120fe8f28..bb22419fa 100644 --- a/health/health.d/fping.conf +++ b/health/health.d/fping.conf @@ -2,9 +2,9 @@ template: fping_last_collected_secs families: * on: fping.latency - class: Other + class: Latency + type: Other component: Network - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -17,9 +17,9 @@ component: Network template: fping_host_reachable families: * on: fping.latency - class: Other + class: Errors + type: Other component: Network - type: Errors calc: $average != nan units: up/down every: 10s @@ -31,9 +31,9 @@ component: Network template: fping_host_latency families: * on: fping.latency - class: Other + class: Latency + type: Other component: Network - type: Latency lookup: average -10s unaligned of average units: ms every: 10s @@ -48,9 +48,9 @@ component: Network template: fping_packet_loss families: * on: fping.quality - class: System + class: Errors + type: System component: Network - type: Errors lookup: average -10m unaligned of returned calc: 100 - $this green: 1 diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf index 81aafaa60..853bd7fbc 100644 --- a/health/health.d/fronius.conf +++ b/health/health.d/fronius.conf @@ -1,9 +1,9 @@ template: fronius_last_collected_secs families: * on: fronius.power - class: Power Supply + class: Latency + type: Power Supply component: Solar - type: Latency calc: $now - $last_collected_t every: 10s units: seconds ago diff 
--git a/health/health.d/gearman.conf b/health/health.d/gearman.conf index e2031bf2b..14010d445 100644 --- a/health/health.d/gearman.conf +++ b/health/health.d/gearman.conf @@ -1,24 +1,10 @@ -# make sure Gearman is running - template: gearman_last_collected_secs - on: gearman.total_jobs - class: Computing -component: Gearman - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin template: gearman_workers_queued on: gearman.single_job - class: Computing + class: Latency + type: Computing component: Gearman - type: Latency - lookup: average -10m unaligned match-names of Queued + lookup: average -10m unaligned match-names of Pending units: workers every: 10s warn: $this > 30000 diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf new file mode 100644 index 000000000..dd1eb4701 --- /dev/null +++ b/health/health.d/geth.conf @@ -0,0 +1,12 @@ +#chainhead_header is expected momentarily to be ahead. If it's considerably ahead (e.g. more than 5 blocks), then the node is definitely out of sync.
+ template: geth_chainhead_diff_between_header_block + on: geth.chainhead + class: Workload + type: ethereum_node +component: geth + every: 10s + calc: $chain_head_block - $chain_head_header + units: blocks + warn: $this != 0 + crit: $this > 5 + delay: down 1m multiplier 1.5 max 1h diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf new file mode 100644 index 000000000..8bf84a976 --- /dev/null +++ b/health/health.d/go.d.plugin.conf @@ -0,0 +1,17 @@ + +# make sure go.d.plugin data collection job is running + + template: go.d_job_last_collected_secs + on: netdata.go_plugin_execution_time + class: Error + type: Netdata +component: go.d.plugin + module: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf index 9f6b1c577..a0ab52bca 100644 --- a/health/health.d/haproxy.conf +++ b/health/health.d/haproxy.conf @@ -1,8 +1,8 @@ template: haproxy_backend_server_status on: haproxy_hs.down - class: Web Proxy + class: Errors + type: Web Proxy component: HAProxy - type: Errors units: failed servers every: 10s lookup: average -10s @@ -12,25 +12,12 @@ component: HAProxy template: haproxy_backend_status on: haproxy_hb.down - class: Web Proxy + class: Errors + type: Web Proxy component: HAProxy - type: Errors units: failed backend every: 10s lookup: average -10s crit: $this > 0 info: average number of failed haproxy backends over the last 10 seconds to: sysadmin - - template: haproxy_last_collected - on: haproxy_hb.down - class: Web Proxy -component: HAProxy - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? 
($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf index bd8308bed..ca8df31b9 100644 --- a/health/health.d/hdfs.conf +++ b/health/health.d/hdfs.conf @@ -1,28 +1,11 @@ -# make sure hdfs is running - - template: hdfs_last_collected_secs - on: hdfs.heap_memory - class: Storage -component: HDFS - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - # Common template: hdfs_capacity_usage on: hdfs.capacity - class: Storage + class: Utilization + type: Storage component: HDFS - type: Utilization calc: ($used) * 100 / ($used + $remaining) units: % every: 10s @@ -37,9 +20,9 @@ component: HDFS template: hdfs_missing_blocks on: hdfs.blocks - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $missing units: missing blocks every: 10s @@ -51,9 +34,9 @@ component: HDFS template: hdfs_stale_nodes on: hdfs.data_nodes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $stale units: dead nodes every: 10s @@ -65,9 +48,9 @@ component: HDFS template: hdfs_dead_nodes on: hdfs.data_nodes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $dead units: dead nodes every: 10s @@ -81,9 +64,9 @@ component: HDFS template: hdfs_num_failed_volumes on: hdfs.num_failed_volumes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $fsds_num_failed_volumes units: failed volumes every: 10s diff --git a/health/health.d/httpcheck.conf 
b/health/health.d/httpcheck.conf index d4d6376a3..599c47acc 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -1,25 +1,11 @@ - template: httpcheck_last_collected_secs - families: * - on: httpcheck.status - class: Other -component: HTTP endpoint - type: Latency - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges template: httpcheck_web_service_up families: * on: httpcheck.status - class: Web Server + class: Utilization + type: Web Server component: HTTP endpoint - type: Utilization lookup: average -1m unaligned percentage of success calc: ($this < 75) ? (0) : ($this) every: 5s @@ -30,9 +16,9 @@ component: HTTP endpoint template: httpcheck_web_service_bad_content families: * on: httpcheck.status - class: Web Server + class: Workload + type: Web Server component: HTTP endpoint - type: Workload lookup: average -5m unaligned percentage of bad_content every: 10s units: % @@ -46,9 +32,9 @@ component: HTTP endpoint template: httpcheck_web_service_bad_status families: * on: httpcheck.status - class: Web Server + class: Workload + type: Web Server component: HTTP endpoint - type: Workload lookup: average -5m unaligned percentage of bad_status every: 10s units: % @@ -62,9 +48,9 @@ component: HTTP endpoint template: httpcheck_web_service_timeouts families: * on: httpcheck.status - class: Web Server + class: Latency + type: Web Server component: HTTP endpoint - type: Latency lookup: average -5m unaligned percentage of timeout every: 10s units: % @@ -73,9 +59,9 @@ component: HTTP endpoint template: httpcheck_no_web_service_connections families: * on: httpcheck.status 
- class: Other + class: Errors + type: Other component: HTTP endpoint - type: Errors lookup: average -5m unaligned percentage of no_connection every: 10s units: % @@ -85,9 +71,9 @@ component: HTTP endpoint template: httpcheck_web_service_unreachable families: * on: httpcheck.status - class: Web Server + class: Errors + type: Web Server component: HTTP endpoint - type: Errors calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) units: % every: 10s @@ -101,9 +87,9 @@ component: HTTP endpoint template: httpcheck_1h_web_service_response_time families: * on: httpcheck.responsetime - class: Other + class: Latency + type: Other component: HTTP endpoint - type: Latency lookup: average -1h unaligned of time every: 30s units: ms @@ -112,9 +98,9 @@ component: HTTP endpoint template: httpcheck_web_service_slow families: * on: httpcheck.responsetime - class: Web Server + class: Latency + type: Web Server component: HTTP endpoint - type: Latency lookup: average -3m unaligned of time units: ms every: 10s diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 57ce4e866..ee4befbea 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -1,9 +1,9 @@ template: ioping_disk_latency families: * on: ioping.latency - class: System + class: Latency + type: System component: Disk - type: Latency lookup: average -10s unaligned of average units: ms every: 10s diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf index 6eaf7abe9..c178a410a 100644 --- a/health/health.d/ipc.conf +++ b/health/health.d/ipc.conf @@ -3,9 +3,9 @@ alarm: semaphores_used on: system.ipc_semaphores - class: System + class: Utilization + type: System component: IPC - type: Utilization os: linux hosts: * calc: $semaphores * 100 / $ipc_semaphores_max @@ -19,9 +19,9 @@ component: IPC alarm: semaphore_arrays_used on: system.ipc_semaphore_arrays - class: System + 
class: Utilization + type: System component: IPC - type: Utilization os: linux hosts: * calc: $arrays * 100 / $ipc_semaphores_arrays_max diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf index 6268f4092..a514ddfd0 100644 --- a/health/health.d/ipfs.conf +++ b/health/health.d/ipfs.conf @@ -1,9 +1,9 @@ template: ipfs_datastore_usage on: ipfs.repo_size - class: Data Sharing + class: Utilization + type: Data Sharing component: IPFS - type: Utilization calc: $size * 100 / $avail units: % every: 10s diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf index d4fdc6c79..feadba1b7 100644 --- a/health/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf @@ -1,8 +1,8 @@ alarm: ipmi_sensors_states on: ipmi.sensors_states - class: System + class: Errors + type: System component: IPMI - type: Errors calc: $warning + $critical units: sensors every: 10s @@ -14,9 +14,9 @@ component: IPMI alarm: ipmi_events on: ipmi.events - class: System + class: Utilization + type: System component: IPMI - type: Utilization calc: $events units: events every: 10s diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf index 4d3c45f97..c2778cc5e 100644 --- a/health/health.d/kubelet.conf +++ b/health/health.d/kubelet.conf @@ -6,9 +6,9 @@ template: kubelet_node_config_error on: k8s_kubelet.kubelet_node_config_error - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors calc: $kubelet_node_config_error units: bool every: 10s @@ -22,9 +22,9 @@ component: Kubelet template: kubelet_token_requests lookup: sum -10s of token_fail_count on: k8s_kubelet.kubelet_token_requests - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors units: failed requests every: 10s warn: $this > 0 @@ -37,9 +37,9 @@ component: Kubelet template: kubelet_operations_error lookup: sum -1m on: k8s_kubelet.kubelet_operations_errors - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors 
units: errors every: 10s warn: $this > (($status >= $WARNING) ? (0) : (20)) @@ -64,9 +64,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 units: microseconds every: 10s @@ -74,9 +74,9 @@ component: Kubelet template: kubelet_10s_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) every: 10s @@ -92,9 +92,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 units: microseconds every: 10s @@ -102,9 +102,9 @@ component: Kubelet template: kubelet_10s_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) every: 10s @@ -120,9 +120,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 units: microseconds every: 10s @@ -130,9 +130,9 @@ component: 
Kubelet template: kubelet_10s_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) every: 10s diff --git a/health/health.d/lighttpd.conf b/health/health.d/lighttpd.conf deleted file mode 100644 index 0f067549e..000000000 --- a/health/health.d/lighttpd.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure lighttpd is running - - template: lighttpd_last_collected_secs - on: lighttpd.requests - class: Web Server -component: Lighttpd - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf index e28c246a3..c0bc6de8a 100644 --- a/health/health.d/linux_power_supply.conf +++ b/health/health.d/linux_power_supply.conf @@ -2,9 +2,9 @@ template: linux_power_supply_capacity on: powersupply.capacity - class: Power Supply + class: Utilization + type: Power Supply component: Battery - type: Utilization calc: $capacity units: % every: 10s diff --git a/health/health.d/load.conf b/health/health.d/load.conf index e811f6ee2..0bd872f85 100644 --- a/health/health.d/load.conf +++ b/health/health.d/load.conf @@ -6,9 +6,9 @@ # minute, with a special case for a single CPU of setting the trigger at 2. 
alarm: load_cpu_number on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors ) @@ -22,9 +22,9 @@ component: Load alarm: load_average_15 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load15 @@ -37,9 +37,9 @@ component: Load alarm: load_average_5 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load5 @@ -52,9 +52,9 @@ component: Load alarm: load_average_1 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load1 diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index 67483b201..cedaa000e 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -1,8 +1,8 @@ template: mdstat_last_collected on: md.disks - class: System + class: Latency + type: System component: RAID - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -13,9 +13,9 @@ component: RAID template: mdstat_disks on: md.disks - class: System + class: Errors + type: System component: RAID - type: Errors units: failed devices every: 10s calc: $down @@ -26,9 +26,9 @@ component: RAID template: mdstat_mismatch_cnt on: md.mismatch_cnt - class: System + class: Errors + type: System component: RAID - type: Errors families: !*(raid1) !*(raid10) * units: unsynchronized blocks calc: $count @@ -40,9 +40,9 @@ component: RAID template: mdstat_nonredundant_last_collected on: md.nonredundant - class: System + class: Latency + type: System component: RAID - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/megacli.conf 
b/health/health.d/megacli.conf index 1b6502f62..9fbcfdb92 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -3,9 +3,9 @@ template: megacli_adapter_state on: megacli.adapter_degraded - class: System + class: Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: boolean every: 10s @@ -18,9 +18,9 @@ component: RAID template: megacli_pd_predictive_failures on: megacli.pd_predictive_failure - class: System + class: Errors + type: System component: RAID - type: Errors lookup: sum -10s foreach * units: predictive failures every: 10s @@ -31,9 +31,9 @@ component: RAID template: megacli_pd_media_errors on: megacli.pd_media_error - class: System + class: Errors + type: System component: RAID - type: Errors lookup: sum -10s foreach * units: media errors every: 10s @@ -46,9 +46,9 @@ component: RAID template: megacli_bbu_relative_charge on: megacli.bbu_relative_charge - class: System + class: Workload + type: System component: RAID - type: Workload lookup: average -10s units: percent every: 10s @@ -59,9 +59,9 @@ component: RAID template: megacli_bbu_cycle_count on: megacli.bbu_cycle_count - class: System + class: Workload + type: System component: RAID - type: Workload lookup: average -10s units: cycles every: 10s diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf index f4b734c38..2a2fe4b82 100644 --- a/health/health.d/memcached.conf +++ b/health/health.d/memcached.conf @@ -1,28 +1,11 @@ -# make sure memcached is running - - template: memcached_last_collected_secs - on: memcached.cache - class: KV Storage -component: Memcached - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - - # detect if memcached cache is full template: memcached_cache_memory_usage on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization calc: $used * 100 / ($used + $available) units: % every: 10s @@ -37,9 +20,9 @@ component: Memcached template: memcached_cache_fill_rate on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization lookup: min -10m at -50m unaligned of available calc: ($this - $available) / (($now - $after) / 3600) units: KB/hour @@ -51,9 +34,9 @@ component: Memcached template: memcached_out_of_cache_space_time on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf) units: hours every: 10s diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf index ab651315f..010cbbd7b 100644 --- a/health/health.d/memory.conf +++ b/health/health.d/memory.conf @@ -3,9 +3,9 @@ alarm: 1hour_ecc_memory_correctable on: mem.ecc_ce - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * lookup: sum -10m unaligned @@ -18,9 +18,9 @@ component: Memory alarm: 1hour_ecc_memory_uncorrectable on: mem.ecc_ue - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * lookup: sum -10m unaligned @@ -33,9 +33,9 @@ component: Memory alarm: 1hour_memory_hw_corrupted on: mem.hwcorrupt - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * calc: $HardwareCorrupted diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf deleted file mode 100644 index 8c9bdeb6f..000000000 --- 
a/health/health.d/mongodb.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure mongodb is running - - template: mongodb_last_collected_secs - on: mongodb.read_operations - class: Database -component: MongoDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 91860c4a7..34452d983 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -1,29 +1,11 @@ -# make sure mysql is running - - template: mysql_last_collected_secs - on: mysql.queries - class: Database -component: MySQL - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - - -# ----------------------------------------------------------------------------- # slow queries template: mysql_10s_slow_queries on: mysql.queries - class: Database + class: Latency + type: Database component: MySQL - type: Latency lookup: sum -10s of slow_queries units: slow queries every: 10s @@ -39,9 +21,9 @@ component: MySQL template: mysql_10s_table_locks_immediate on: mysql.table_locks - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization lookup: sum -10s absolute of immediate units: immediate locks every: 10s @@ -50,9 +32,9 @@ component: MySQL template: mysql_10s_table_locks_waited on: mysql.table_locks - class: Database + class: Latency + type: Database component: MySQL - type: Latency lookup: sum -10s absolute of waited units: waited locks every: 10s @@ -61,9 +43,9 @@ component: MySQL template: mysql_10s_waited_locks_ratio on: mysql.table_locks - class: Database + class: Latency + type: Database component: MySQL - type: Latency calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? 
(($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0 units: % every: 10s @@ -79,9 +61,9 @@ component: MySQL template: mysql_connections on: mysql.connections_active - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization calc: $active * 100 / $limit units: % every: 10s @@ -97,9 +79,9 @@ component: MySQL template: mysql_replication on: mysql.slave_status - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: ($sql_running <= 0 OR $io_running <= 0)?0:1 units: ok/failed every: 10s @@ -110,9 +92,9 @@ component: MySQL template: mysql_replication_lag on: mysql.slave_behind - class: Database + class: Latency + type: Database component: MySQL - type: Errors calc: $seconds units: seconds every: 10s @@ -129,9 +111,9 @@ component: MySQL template: mysql_galera_cluster_size_max_2m on: mysql.galera_cluster_size - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization lookup: max -2m absolute units: nodes every: 10s @@ -140,9 +122,9 @@ component: MySQL template: mysql_galera_cluster_size on: mysql.galera_cluster_size - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization calc: $nodes units: nodes every: 10s @@ -156,9 +138,9 @@ component: MySQL template: mysql_galera_cluster_state on: mysql.galera_cluster_state - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: $state every: 10s warn: $this == 2 OR $this == 3 @@ -173,9 +155,9 @@ component: MySQL template: mysql_galera_cluster_status on: mysql.galera_cluster_status - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: $wsrep_cluster_status every: 10s crit: $mysql_galera_cluster_state != nan AND $this != 0 diff --git a/health/health.d/named.conf b/health/health.d/named.conf deleted file mode 100644 index 90266df16..000000000 --- a/health/health.d/named.conf 
+++ /dev/null @@ -1,17 +0,0 @@ - -# make sure named is running - - template: named_last_collected_secs - on: named.global_queries - class: DNS -component: BIND - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: domainadmin - diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 04219e163..028ca7b81 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -6,9 +6,9 @@ template: interface_speed on: net.net - class: System + class: Latency + type: System component: Network - type: Latency os: * hosts: * families: * @@ -19,9 +19,9 @@ component: Network template: 1m_received_traffic_overflow on: net.net - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * families: * @@ -36,9 +36,9 @@ component: Network template: 1m_sent_traffic_overflow on: net.net - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * families: * @@ -63,9 +63,9 @@ component: Network template: inbound_packets_dropped on: net.drops - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* * @@ -76,9 +76,9 @@ component: Network template: outbound_packets_dropped on: net.drops - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* * @@ -89,14 +89,14 @@ component: Network template: inbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* !wl* * lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 1000) ? 
($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m warn: $this >= 2 @@ -106,9 +106,9 @@ component: Network template: outbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* !wl* * @@ -123,14 +123,14 @@ component: Network template: wifi_inbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: wl* lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m warn: $this >= 10 @@ -140,9 +140,9 @@ component: Network template: wifi_outbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: wl* @@ -160,9 +160,9 @@ component: Network template: interface_inbound_errors on: net.errors - class: System + class: Errors + type: System component: Network - type: Errors os: freebsd hosts: * families: * @@ -176,9 +176,9 @@ component: Network template: interface_outbound_errors on: net.errors - class: System + class: Errors + type: System component: Network - type: Errors os: freebsd hosts: * families: * @@ -200,9 +200,9 @@ component: Network template: 10min_fifo_errors on: net.fifo - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: * @@ -225,9 +225,9 @@ component: Network template: 1m_received_packets_rate on: net.packets - class: System + class: Workload + type: System component: Network - type: Workload os: linux freebsd hosts: * families: * @@ -238,9 +238,9 @@ component: 
Network template: 10s_received_packets_storm on: net.packets - class: System + class: Workload + type: System component: Network - type: Workload os: linux freebsd hosts: * families: * diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf index 35c89caf7..7de383fa2 100644 --- a/health/health.d/netfilter.conf +++ b/health/health.d/netfilter.conf @@ -3,9 +3,9 @@ alarm: netfilter_conntrack_full on: netfilter.conntrack_sockets - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: max -10s unaligned of connections diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf deleted file mode 100644 index 30c738f47..000000000 --- a/health/health.d/nginx.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure nginx is running - - template: nginx_last_collected_secs - on: nginx.requests - class: Web Server -component: NGINX - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf deleted file mode 100644 index 5849a9e7e..000000000 --- a/health/health.d/nginx_plus.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure nginx_plus is running - - template: nginx_plus_last_collected_secs - on: nginx_plus.requests_total - class: Web Server -component: NGINX Plus - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/phpfpm.conf b/health/health.d/phpfpm.conf deleted file mode 100644 index fc073a944..000000000 --- a/health/health.d/phpfpm.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure phpfpm is running - - template: phpfpm_last_collected_secs - on: phpfpm.requests - class: Web Server -component: PHP-FPM - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index 72622caed..2e5c1cbfd 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -1,45 +1,12 @@ -# Make sure Pi-hole is responding. - - template: pihole_last_collected_secs - on: pihole.dns_queries_total - class: Ad Filtering -component: Pi-hole - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - -# Blocked DNS queries. - - template: pihole_blocked_queries - on: pihole.dns_queries_percentage - class: Ad Filtering -component: Pi-hole - type: Errors - every: 10s - units: % - calc: $blocked - warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) - crit: $this > ( ($status == $CRITICAL) ? 
( 55 ) : ( 75 ) ) - delay: up 2m down 5m - info: percentage of blocked dns queries over the last 24 hour - to: sysadmin - - # Blocklist last update time. # Default update interval is a week. template: pihole_blocklist_last_update on: pihole.blocklist_last_update - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: seconds calc: $ago @@ -52,15 +19,15 @@ component: Pi-hole template: pihole_blocklist_gravity_file on: pihole.blocklist_last_update - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: boolean calc: $file_exists crit: $this != 1 delay: up 2m down 5m - info: gravity.list (blocklist) file existence state (0: exists, 1: not-exists) + info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists) to: sysadmin # Pi-hole's ability to block unwanted domains. @@ -68,13 +35,13 @@ component: Pi-hole template: pihole_status on: pihole.unwanted_domains_blocking_status - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: boolean calc: $enabled warn: $this != 1 delay: up 2m down 5m - info: unwanted domains blocking status (0: enabled, 1: disabled) + info: unwanted domains blocking status (0: disabled, 1: enabled) to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index b977dbb31..8cbd7729c 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -1,25 +1,11 @@ - template: portcheck_last_collected_secs - families: * - on: portcheck.status - class: Other -component: TCP endpoint - type: Latency - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges template: portcheck_service_reachable families: * on: portcheck.status - class: Other + class: Workload + type: Other component: TCP endpoint - type: Workload lookup: average -1m unaligned percentage of success calc: ($this < 75) ? (0) : ($this) every: 5s @@ -30,9 +16,9 @@ component: TCP endpoint template: portcheck_connection_timeouts families: * on: portcheck.status - class: Other + class: Errors + type: Other component: TCP endpoint - type: Errors lookup: average -5m unaligned percentage of timeout every: 10s units: % @@ -45,9 +31,9 @@ component: TCP endpoint template: portcheck_connection_fails families: * on: portcheck.status - class: Other + class: Errors + type: Other component: TCP endpoint - type: Errors lookup: average -5m unaligned percentage of no_connection,failed every: 10s units: % diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf deleted file mode 100644 index f908a802a..000000000 --- a/health/health.d/postgres.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure postgres is running - - template: postgres_last_collected_secs - on: postgres.db_stat_transactions - class: Database -component: PostgreSQL - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf index b44a24c0b..2929ee3d4 100644 --- a/health/health.d/processes.conf +++ b/health/health.d/processes.conf @@ -2,9 +2,9 @@ alarm: active_processes on: system.active_processes - class: System + class: Workload + type: System component: Processes - type: Workload hosts: * calc: $active * 100 / $pidmax units: % diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf deleted file mode 100644 index 9903d4e38..000000000 --- a/health/health.d/pulsar.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# Availability - - template: pulsar_last_collected_secs - on: pulsar.broker_components - class: Messaging -component: Pulsar - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf new file mode 100644 index 000000000..f3abc588f --- /dev/null +++ b/health/health.d/python.d.plugin.conf @@ -0,0 +1,17 @@ + +# make sure python.d.plugin data collection job is running + + template: python.d_job_last_collected_secs + on: netdata.pythond_runtime + class: Error + type: Netdata +component: python.d.plugin + module: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 0e3cc29fa..6e6e3b400 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -3,9 +3,9 @@ alarm: used_ram_to_ignore on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux freebsd hosts: * calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) @@ -15,13 +15,12 @@ component: Memory alarm: ram_in_use on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux hosts: * -# calc: $used * 100 / ($used + $cached + $free) - calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free) + calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free + $buffers) units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) @@ -32,12 +31,12 @@ component: Memory alarm: ram_available on: mem.available - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux hosts: * - calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) units: % every: 10s warn: $this < (($status >= $WARNING) ? 
(15) : (10)) @@ -46,24 +45,25 @@ component: Memory info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin - alarm: oom_kill - on: mem.oom_kill - os: linux - hosts: * - lookup: sum -1m unaligned - units: kills - every: 10s - warn: $this > 0 - delay: down 5m - info: number of out of memory kills in the last minute - to: sysadmin + alarm: oom_kill + on: mem.oom_kill + os: linux + hosts: * + lookup: sum -30m unaligned + units: kills + every: 5m + warn: $this > 0 + delay: down 10m +host labels: _is_k8s_node = false + info: number of out of memory kills in the last 30 minutes + to: sysadmin ## FreeBSD alarm: ram_in_use on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: freebsd hosts: * calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) @@ -77,9 +77,9 @@ component: Memory alarm: ram_available on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: freebsd hosts: * calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index e8b289942..dfb771e8c 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -1,26 +1,10 @@ -# make sure redis is running - - template: redis_last_collected_secs - on: redis.operations - class: KV Storage -component: Redis - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - template: redis_bgsave_broken families: * on: redis.bgsave_health - class: KV Storage + class: Errors + type: KV Storage component: Redis - type: Errors every: 10s crit: $rdb_last_bgsave_status != 0 units: ok/failed @@ -31,9 +15,9 @@ component: Redis template: redis_bgsave_slow families: * on: redis.bgsave_now - class: KV Storage + class: Latency + type: KV Storage component: Redis - type: Latency every: 10s warn: $rdb_bgsave_in_progress > 600 crit: $rdb_bgsave_in_progress > 1200 diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf index ca22e60de..14aa76b4c 100644 --- a/health/health.d/retroshare.conf +++ b/health/health.d/retroshare.conf @@ -1,26 +1,11 @@ -# make sure RetroShare is running - - template: retroshare_last_collected_secs - on: retroshare.peers - class: Data Sharing -component: Retroshare - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # make sure the DHT is fine when active template: retroshare_dht_working on: retroshare.dht - class: Data Sharing + class: Utilization + type: Data Sharing component: Retroshare - type: Utilization calc: $dht_size_all units: peers every: 1m diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index b2c0e8d9c..261fd48c6 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -1,24 +1,10 @@ -# Ensure that Riak is running. 
template: riak_last_collected_secs - template: riakkv_last_collected_secs - on: riak.kv.throughput - class: Database -component: Riak KV - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba # Warn if a list keys operation is running. template: riakkv_list_keys_active on: riak.core.fsm_active - class: Database + class: Utilization + type: Database component: Riak KV - type: Utilization calc: $list_fsm_active units: state machines every: 10s @@ -31,9 +17,9 @@ component: Riak KV # KV GET template: riakkv_1h_kv_get_mean_latency on: riak.kv.latency.get - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $node_get_fsm_time_mean lookup: average -1h unaligned of time every: 30s @@ -43,9 +29,9 @@ component: Riak KV template: riakkv_kv_get_slow on: riak.kv.latency.get - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $mean lookup: average -3m unaligned of time units: ms @@ -61,9 +47,9 @@ component: Riak KV # KV PUT template: riakkv_1h_kv_put_mean_latency on: riak.kv.latency.put - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $node_put_fsm_time_mean lookup: average -1h unaligned of time every: 30s @@ -73,9 +59,9 @@ component: Riak KV template: riakkv_kv_put_slow on: riak.kv.latency.put - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $mean lookup: average -3m unaligned of time units: ms @@ -95,9 +81,9 @@ component: Riak KV # On systems observed, this is < 2000, but may grow depending on load. 
template: riakkv_vm_high_process_count on: riak.vm - class: Database + class: Utilization + type: Database component: Riak KV - type: Utilization calc: $sys_process_count units: processes every: 10s diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf index 3c0dc1168..ab110bf07 100644 --- a/health/health.d/scaleio.conf +++ b/health/health.d/scaleio.conf @@ -1,27 +1,11 @@ -# make sure scaleio is running - - template: scaleio_last_collected_secs - on: scaleio.system_capacity_total - class: Storage -component: ScaleIO - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # make sure Storage Pool capacity utilization is under limit template: scaleio_storage_pool_capacity_utilization on: scaleio.storage_pool_capacity_utilization - class: Storage + class: Utilization + type: Storage component: ScaleIO - type: Utilization calc: $used units: % every: 10s @@ -36,9 +20,9 @@ component: ScaleIO template: scaleio_sdc_mdm_connection_state on: scaleio.sdc_mdm_connection_state - class: Storage + class: Utilization + type: Storage component: ScaleIO - type: Utilization calc: $connected every: 10s warn: $this != 1 diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index d8b01caff..345f87505 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -5,9 +5,9 @@ alarm: 1min_netdev_backlog_exceeded on: system.softnet_stat - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * lookup: average -1m unaligned absolute of dropped @@ -21,9 +21,9 @@ component: Network alarm: 1min_netdev_budget_ran_outs on: system.softnet_stat - class: System + class: Errors + type: System 
component: Network - type: Errors os: linux hosts: * lookup: average -1m unaligned absolute of squeezed @@ -38,9 +38,9 @@ component: Network alarm: 10min_netisr_backlog_exceeded on: system.softnet_stat - class: System + class: Errors + type: System component: Network - type: Errors os: freebsd hosts: * lookup: average -1m unaligned absolute of qdrops diff --git a/health/health.d/squid.conf b/health/health.d/squid.conf deleted file mode 100644 index 5c3d17629..000000000 --- a/health/health.d/squid.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure squid is running - - template: squid_last_collected_secs - on: squid.clients_requests - class: Web Proxy -component: Squid - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: proxyadmin - diff --git a/health/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf index f793b5ed1..493c8b73a 100644 --- a/health/health.d/stiebeleltron.conf +++ b/health/health.d/stiebeleltron.conf @@ -1,9 +1,9 @@ template: stiebeleltron_last_collected_secs families: * on: stiebeleltron.heating.hc1 - class: Other + class: Latency + type: Other component: Sensors - type: Latency calc: $now - $last_collected_t every: 10s units: seconds ago diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf index 5b3f89a97..03c319320 100644 --- a/health/health.d/swap.conf +++ b/health/health.d/swap.conf @@ -3,9 +3,9 @@ alarm: 30min_ram_swapped_out on: system.swapio - class: System + class: Workload + type: System component: Memory - type: Workload os: linux freebsd hosts: * lookup: sum -30m unaligned absolute of out @@ -20,12 +20,12 @@ component: Memory alarm: used_swap on: system.swap - class: System + class: Utilization + type: System 
component: Memory - type: Utilization os: linux freebsd hosts: * - calc: $used * 100 / ( $used + $free ) + calc: ($used + $free) > 0 ? ($used * 100 / ($used + $free)) : 0 units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf index cc1a8698d..38213a8db 100644 --- a/health/health.d/systemdunits.conf +++ b/health/health.d/systemdunits.conf @@ -4,9 +4,9 @@ ## Service units template: systemd_service_units_state on: systemd.service_units_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -18,9 +18,9 @@ component: Systemd units ## Socket units template: systemd_socket_units_state on: systemd.socket_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -32,9 +32,9 @@ component: Systemd units ## Target units template: systemd_target_units_state on: systemd.target_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -46,9 +46,9 @@ component: Systemd units ## Path units template: systemd_path_units_state on: systemd.path_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -60,9 +60,9 @@ component: Systemd units ## Device units template: systemd_device_units_state on: systemd.device_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -74,9 +74,9 @@ component: Systemd units ## Mount units template: systemd_mount_units_state on: systemd.mount_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -88,9 
+88,9 @@ component: Systemd units ## Automount units template: systemd_automount_units_state on: systemd.automount_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -102,9 +102,9 @@ component: Systemd units ## Swap units template: systemd_swap_units_state on: systemd.swap_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -116,9 +116,9 @@ component: Systemd units ## Scope units template: systemd_scope_units_state on: systemd.scope_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s @@ -130,9 +130,9 @@ component: Systemd units ## Slice units template: systemd_slice_units_state on: systemd.slice_unit_state - class: Linux + class: Errors + type: Linux component: Systemd units - type: Errors lookup: max -1s min2max units: ok/failed every: 10s diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf index f2c5e4e5d..67b3bee53 100644 --- a/health/health.d/tcp_conn.conf +++ b/health/health.d/tcp_conn.conf @@ -7,9 +7,9 @@ alarm: tcp_connections on: ipv4.tcpsock - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * calc: (${tcp_max_connections} > 0) ? 
( ${connections} * 100 / ${tcp_max_connections} ) : 0 diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf index 51a0e461c..d4bcfa248 100644 --- a/health/health.d/tcp_listen.conf +++ b/health/health.d/tcp_listen.conf @@ -20,9 +20,9 @@ alarm: 1m_tcp_accept_queue_overflows on: ip.tcp_accept_queue - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: average -60s unaligned absolute of ListenOverflows @@ -38,9 +38,9 @@ component: Network # CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 alarm: 1m_tcp_accept_queue_drops on: ip.tcp_accept_queue - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: average -60s unaligned absolute of ListenDrops @@ -63,9 +63,9 @@ component: Network alarm: 1m_tcp_syn_queue_drops on: ip.tcp_syn_queue - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: average -60s unaligned absolute of TCPReqQFullDrop @@ -80,9 +80,9 @@ component: Network alarm: 1m_tcp_syn_queue_cookies on: ip.tcp_syn_queue - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: average -60s unaligned absolute of TCPReqQFullDoCookies diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf index 646e5c6da..318be20ac 100644 --- a/health/health.d/tcp_mem.conf +++ b/health/health.d/tcp_mem.conf @@ -8,9 +8,9 @@ alarm: tcp_memory on: ipv4.sockstat_tcp_mem - class: System + class: Utilization + type: System component: Network - type: Utilization os: linux hosts: * calc: ${mem} * 100 / ${tcp_mem_high} diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf index 6e94d67d1..cbd628da5 100644 --- a/health/health.d/tcp_orphans.conf +++ b/health/health.d/tcp_orphans.conf @@ -9,9 +9,9 @@ alarm: tcp_orphans on: ipv4.sockstat_tcp_sockets - class: System + class: Errors 
+ type: System component: Network - type: Errors os: linux hosts: * calc: ${orphan} * 100 / ${tcp_max_orphans} diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 41355dad6..190271e47 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -6,9 +6,9 @@ alarm: 1m_ipv4_tcp_resets_sent on: ipv4.tcphandshake - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * lookup: average -1m at -10s unaligned absolute of OutRsts @@ -18,9 +18,9 @@ component: Network alarm: 10s_ipv4_tcp_resets_sent on: ipv4.tcphandshake - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * lookup: average -10s unaligned absolute of OutRsts @@ -40,9 +40,9 @@ component: Network alarm: 1m_ipv4_tcp_resets_received on: ipv4.tcphandshake - class: System + class: Errors + type: System component: Network - type: Errors os: linux freebsd hosts: * lookup: average -1m at -10s unaligned absolute of AttemptFails @@ -52,9 +52,9 @@ component: Network alarm: 10s_ipv4_tcp_resets_received on: ipv4.tcphandshake - class: System + class: Errors + type: System component: Network - type: Errors os: linux freebsd hosts: * lookup: average -10s unaligned absolute of AttemptFails diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf new file mode 100644 index 000000000..ea90c4000 --- /dev/null +++ b/health/health.d/timex.conf @@ -0,0 +1,17 @@ + +# It can take several minutes before ntpd selects a server to synchronize with; +# try checking after 17 minutes (1024 seconds). 
+ + alarm: system_clock_sync_state + on: system.clock_sync_state + os: linux + class: Errors + type: System +component: Clock + calc: $state + units: synchronization state + every: 10s + warn: $system.uptime.uptime > 17 * 60 AND $this == 0 + delay: down 5m + info: the system time is not synchronized to a reliable server + to: silent diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf index 342a1aedd..64f47dfa7 100644 --- a/health/health.d/udp_errors.conf +++ b/health/health.d/udp_errors.conf @@ -6,9 +6,9 @@ alarm: 1m_ipv4_udp_receive_buffer_errors on: ipv4.udperrors - class: System + class: Errors + type: System component: Network - type: Errors os: linux freebsd hosts: * lookup: average -1m unaligned absolute of RcvbufErrors @@ -24,9 +24,9 @@ component: Network alarm: 1m_ipv4_udp_send_buffer_errors on: ipv4.udperrors - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * lookup: average -1m unaligned absolute of SndbufErrors diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf index 1df15474f..4e8d164d2 100644 --- a/health/health.d/unbound.conf +++ b/health/health.d/unbound.conf @@ -1,27 +1,11 @@ -# make sure unbound is running - - template: unbound_last_collected_secs - on: unbound.queries - class: DNS -component: Unbound - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ?
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # make sure there is no overwritten/dropped queries in the request-list template: unbound_request_list_overwritten on: unbound.request_list_jostle_list - class: DNS + class: Errors + type: DNS component: Unbound - type: Errors lookup: average -60s unaligned absolute match-names of overwritten units: queries every: 10s @@ -32,9 +16,9 @@ component: Unbound template: unbound_request_list_dropped on: unbound.request_list_jostle_list - class: DNS + class: Errors + type: DNS component: Unbound - type: Errors lookup: average -60s unaligned absolute match-names of dropped units: queries every: 10s diff --git a/health/health.d/varnish.conf b/health/health.d/varnish.conf deleted file mode 100644 index 7f3bd6c82..000000000 --- a/health/health.d/varnish.conf +++ /dev/null @@ -1,12 +0,0 @@ - alarm: varnish_last_collected - on: varnish.uptime - class: Web Proxy -component: Varnish - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf index 8538e488c..a9cc7ceef 100644 --- a/health/health.d/vcsa.conf +++ b/health/health.d/vcsa.conf @@ -1,20 +1,4 @@ -# make sure vcsa is running and responding - - template: vcsa_last_collected_secs - on: vcsa.system_health - class: Virtual Machine -component: VMware vCenter - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # Overall system health: # - 0: all components are healthy. # - 1: one or more components might become overloaded soon. @@ -24,9 +8,9 @@ component: VMware vCenter template: vcsa_system_health on: vcsa.system_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of system units: status every: 10s @@ -46,9 +30,9 @@ component: VMware vCenter template: vcsa_swap_health on: vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of swap units: status every: 10s @@ -61,9 +45,9 @@ component: VMware vCenter template: vcsa_storage_health on: vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of storage units: status every: 10s @@ -76,9 +60,9 @@ component: VMware vCenter template: vcsa_mem_health on: vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of mem units: status every: 10s @@ -91,9 +75,9 @@ component: VMware vCenter template: vcsa_load_health on: vcsa.components_health - class: Virtual Machine + class: Utilization + type: Virtual Machine component: VMware vCenter - type: Utilization lookup: max -10s unaligned of load units: status every: 10s @@ -106,9 +90,9 @@ component: VMware vCenter template: vcsa_database_storage_health on: vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of database_storage units: status every: 10s @@ -121,9 +105,9 @@ component: VMware vCenter template: vcsa_applmgmt_health on: 
vcsa.components_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of applmgmt units: status every: 10s @@ -143,9 +127,9 @@ component: VMware vCenter template: vcsa_software_updates_health on: vcsa.software_updates_health - class: Virtual Machine + class: Errors + type: Virtual Machine component: VMware vCenter - type: Errors lookup: max -10s unaligned of software_packages units: status every: 10s diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf index 737147f38..cfbe2a524 100644 --- a/health/health.d/vernemq.conf +++ b/health/health.d/vernemq.conf @@ -1,27 +1,11 @@ -# Availability - - template: vernemq_last_collected_secs - on: vernemq.node_uptime - class: Messaging -component: VerneMQ - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # Socket errors template: vernemq_socket_errors on: vernemq.socket_errors - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: sum -1m unaligned absolute of socket_error units: errors every: 1m @@ -34,9 +18,9 @@ component: VerneMQ template: vernemq_queue_message_drop on: vernemq.queue_undelivered_messages - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute of queue_message_drop units: dropped messages every: 1m @@ -47,9 +31,9 @@ component: VerneMQ template: vernemq_queue_message_expired on: vernemq.queue_undelivered_messages - class: Messaging + class: Latency + type: Messaging component: VerneMQ - type: Latency lookup: average -1m unaligned absolute of queue_message_expired units: expired messages every: 1m @@ -60,9 +44,9 @@ component: VerneMQ template: vernemq_queue_message_unhandled on: vernemq.queue_undelivered_messages - class: Messaging + class: Latency + type: Messaging component: VerneMQ - type: Latency lookup: average -1m unaligned absolute of queue_message_unhandled units: unhandled messages every: 1m @@ -75,9 +59,9 @@ component: VerneMQ template: vernemq_average_scheduler_utilization on: vernemq.average_scheduler_utilization - class: Messaging + class: Utilization + type: Messaging component: VerneMQ - type: Utilization lookup: average -10m unaligned units: % every: 1m @@ -91,9 +75,9 @@ component: VerneMQ template: vernemq_cluster_dropped on: vernemq.cluster_dropped - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: sum -1m unaligned units: KiB every: 1m @@ -104,9 +88,9 @@ component: VerneMQ template: vernemq_netsplits on: vernemq.netsplits - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload 
lookup: sum -1m unaligned absolute of netsplit_detected units: netsplits every: 10s @@ -119,9 +103,9 @@ component: VerneMQ template: vernemq_mqtt_connack_sent_reason_unsuccessful on: vernemq.mqtt_connack_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -134,9 +118,9 @@ component: VerneMQ template: vernemq_mqtt_disconnect_received_reason_not_normal on: vernemq.mqtt_disconnect_received_reason - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute match-names of !normal_disconnect,* units: packets every: 1m @@ -147,9 +131,9 @@ component: VerneMQ template: vernemq_mqtt_disconnect_sent_reason_not_normal on: vernemq.mqtt_disconnect_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !normal_disconnect,* units: packets every: 1m @@ -162,9 +146,9 @@ component: VerneMQ template: vernemq_mqtt_subscribe_error on: vernemq.mqtt_subscribe_error - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute units: failed ops every: 1m @@ -175,9 +159,9 @@ component: VerneMQ template: vernemq_mqtt_subscribe_auth_error on: vernemq.mqtt_subscribe_auth_error - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: attempts every: 1m @@ -190,9 +174,9 @@ component: VerneMQ template: vernemq_mqtt_unsubscribe_error on: vernemq.mqtt_unsubscribe_error - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute units: failed ops every: 1m @@ -205,9 +189,9 @@ component: VerneMQ template: vernemq_mqtt_publish_errors on: vernemq.mqtt_publish_errors - class: Messaging + class: 
Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute units: failed ops every: 1m @@ -218,9 +202,9 @@ component: VerneMQ template: vernemq_mqtt_publish_auth_errors on: vernemq.mqtt_publish_auth_errors - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: attempts every: 1m @@ -233,9 +217,9 @@ component: VerneMQ template: vernemq_mqtt_puback_received_reason_unsuccessful on: vernemq.mqtt_puback_received_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -246,9 +230,9 @@ component: VerneMQ template: vernemq_mqtt_puback_sent_reason_unsuccessful on: vernemq.mqtt_puback_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -259,9 +243,9 @@ component: VerneMQ template: vernemq_mqtt_puback_unexpected on: vernemq.mqtt_puback_invalid_error - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: messages every: 1m @@ -274,9 +258,9 @@ component: VerneMQ template: vernemq_mqtt_pubrec_received_reason_unsuccessful on: vernemq.mqtt_pubrec_received_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -287,9 +271,9 @@ component: VerneMQ template: vernemq_mqtt_pubrec_sent_reason_unsuccessful on: vernemq.mqtt_pubrec_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -300,9 +284,9 @@ component: VerneMQ template: 
vernemq_mqtt_pubrec_invalid_error on: vernemq.mqtt_pubrec_invalid_error - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: messages every: 1m @@ -315,9 +299,9 @@ component: VerneMQ template: vernemq_mqtt_pubrel_received_reason_unsuccessful on: vernemq.mqtt_pubrel_received_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -328,9 +312,9 @@ component: VerneMQ template: vernemq_mqtt_pubrel_sent_reason_unsuccessful on: vernemq.mqtt_pubrel_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -343,9 +327,9 @@ component: VerneMQ template: vernemq_mqtt_pubcomp_received_reason_unsuccessful on: vernemq.mqtt_pubcomp_received_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -356,9 +340,9 @@ component: VerneMQ template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful on: vernemq.mqtt_pubcomp_sent_reason - class: Messaging + class: Errors + type: Messaging component: VerneMQ - type: Errors lookup: average -1m unaligned absolute match-names of !success,* units: packets every: 1m @@ -369,9 +353,9 @@ component: VerneMQ template: vernemq_mqtt_pubcomp_unexpected on: vernemq.mqtt_pubcomp_invalid_error - class: Messaging + class: Workload + type: Messaging component: VerneMQ - type: Workload lookup: average -1m unaligned absolute units: messages every: 1m diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf index aee7c5cd4..d8fc899b9 100644 --- a/health/health.d/vsphere.conf +++ b/health/health.d/vsphere.conf @@ -6,9 +6,9 @@ template: vsphere_vm_mem_usage on: 
vsphere.vm_mem_usage_percentage - class: Virtual Machine + class: Utilization + type: Virtual Machine component: Memory - type: Utilization hosts: * calc: $used units: % @@ -23,9 +23,9 @@ component: Memory template: vsphere_host_mem_usage on: vsphere.host_mem_usage_percentage - class: Virtual Machine + class: Utilization + type: Virtual Machine component: Memory - type: Utilization hosts: * calc: $used units: % @@ -39,9 +39,9 @@ component: Memory template: vsphere_inbound_packets_errors on: vsphere.net_errors_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx @@ -51,9 +51,9 @@ component: Network template: vsphere_outbound_packets_errors on: vsphere.net_errors_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx @@ -65,9 +65,9 @@ component: Network template: vsphere_inbound_packets_errors_ratio on: vsphere.net_packets_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx @@ -81,9 +81,9 @@ component: Network template: vsphere_outbound_packets_errors_ratio on: vsphere.net_packets_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx @@ -100,9 +100,9 @@ component: Network template: vsphere_cpu_usage on: vsphere.cpu_usage_total - class: Virtual Machine + class: Utilization + type: Virtual Machine component: CPU - type: Utilization hosts: * lookup: average -10m unaligned match-names of used units: % @@ -117,9 +117,9 @@ component: CPU template: vsphere_inbound_packets_dropped on: vsphere.net_drops_total - class: Virtual Machine + class: Errors + type: Virtual Machine 
component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx @@ -129,9 +129,9 @@ component: Network template: vsphere_outbound_packets_dropped on: vsphere.net_drops_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx @@ -143,9 +143,9 @@ component: Network template: vsphere_inbound_packets_dropped_ratio on: vsphere.net_packets_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx @@ -159,9 +159,9 @@ component: Network template: vsphere_outbound_packets_dropped_ratio on: vsphere.net_packets_total - class: Virtual Machine + class: Errors + type: Virtual Machine component: Network - type: Errors hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 127c9a9c6..454e0abef 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -1,22 +1,4 @@ -# make sure we can collect web log data - - template: last_collected_secs - on: web_log.response_codes - class: Web Server -component: Web log - type: Latency - families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - # ----------------------------------------------------------------------------- # high level response code alarms @@ -29,9 +11,9 @@ component: Web log template: 1m_requests on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) @@ -41,9 +23,9 @@ component: Web log template: 1m_successful on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned of successful_requests calc: $this * 100 / $1m_requests @@ -57,41 +39,39 @@ component: Web log template: 1m_redirects on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned of redirects calc: $this * 100 / $1m_requests units: % every: 10s warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster template: 1m_bad_requests on: web_log.response_statuses - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of bad_requests calc: $this * 100 / $1m_requests units: % every: 10s warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? 
( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster template: 1m_internal_errors on: web_log.response_statuses - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of server_errors calc: $this * 100 / $1m_requests @@ -114,9 +94,9 @@ component: Web log template: 1m_total_requests on: web_log.response_codes - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) @@ -126,9 +106,9 @@ component: Web log template: 1m_unmatched on: web_log.response_codes - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of unmatched calc: $this * 100 / $1m_total_requests @@ -151,9 +131,9 @@ component: Web log template: 10m_response_time on: web_log.response_time - class: System + class: Latency + type: System component: Web log - type: Latency families: * lookup: average -10m unaligned of avg units: ms @@ -162,9 +142,9 @@ component: Web log template: web_slow on: web_log.response_time - class: Web Server + class: Latency + type: Web Server component: Web log - type: Latency families: * lookup: average -1m unaligned of avg units: ms @@ -191,9 +171,9 @@ component: Web log template: 5m_successful_old on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: average -5m at -5m unaligned of successful_requests units: requests/s @@ -202,9 +182,9 @@ component: Web log template: 5m_successful on: web_log.response_statuses - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: average -5m unaligned of successful_requests units: requests/s @@ -213,9 +193,9 @@ component: Web log template: 
5m_requests_ratio on: web_log.response_codes - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100) units: % @@ -233,23 +213,6 @@ component: Web log # ---------------------------------------------------GO-VERSION--------------------------------------------------------- -# make sure we can collect web log data - - template: web_log_last_collected_secs - on: web_log.requests - class: Web Server -component: Web log - type: Latency - families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - # unmatched lines # the following alarms trigger only when there are enough data. 
@@ -261,9 +224,9 @@ component: Web log template: web_log_1m_total_requests on: web_log.requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) @@ -273,9 +236,9 @@ component: Web log template: web_log_1m_unmatched on: web_log.excluded_requests - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of unmatched calc: $this * 100 / $web_log_1m_total_requests @@ -298,9 +261,9 @@ component: Web log template: web_log_1m_requests on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) @@ -310,9 +273,9 @@ component: Web log template: web_log_1m_successful on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned of success calc: $this * 100 / $web_log_1m_requests @@ -326,41 +289,39 @@ component: Web log template: web_log_1m_redirects on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: sum -1m unaligned of redirect calc: $this * 100 / $web_log_1m_requests units: % every: 10s warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? 
( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster template: web_log_1m_bad_requests on: web_log.type_requests - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of bad calc: $this * 100 / $web_log_1m_requests units: % every: 10s warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster template: web_log_1m_internal_errors on: web_log.type_requests - class: Web Server + class: Errors + type: Web Server component: Web log - type: Errors families: * lookup: sum -1m unaligned of error calc: $this * 100 / $web_log_1m_requests @@ -384,9 +345,9 @@ component: Web log template: web_log_10m_response_time on: web_log.request_processing_time - class: System + class: Latency + type: System component: Web log - type: Latency families: * lookup: average -10m unaligned of avg units: ms @@ -395,9 +356,9 @@ component: Web log template: web_log_web_slow on: web_log.request_processing_time - class: Web Server + class: Latency + type: Web Server component: Web log - type: Latency families: * lookup: average -1m unaligned of avg units: ms @@ -424,9 +385,9 @@ component: Web log template: web_log_5m_successful_old on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: average -5m at -5m unaligned of success units: requests/s @@ -435,9 +396,9 @@ component: Web log template: web_log_5m_successful on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * lookup: average -5m 
unaligned of success units: requests/s @@ -446,9 +407,9 @@ component: Web log template: web_log_5m_requests_ratio on: web_log.type_requests - class: Web Server + class: Workload + type: Web Server component: Web log - type: Workload families: * calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100) units: % diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf index c6d3a9de0..be5eb58f9 100644 --- a/health/health.d/whoisquery.conf +++ b/health/health.d/whoisquery.conf @@ -1,26 +1,9 @@ -# make sure whoisquery is running - - template: whoisquery_last_collected_secs - on: whoisquery.time_until_expiration - class: Other -component: WHOIS - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 60s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - template: whoisquery_days_until_expiration on: whoisquery.time_until_expiration - class: Other + class: Utilization + type: Other component: WHOIS - type: Utilization calc: $expiry units: seconds every: 60s diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf index 6bd4e077f..90d39ce9d 100644 --- a/health/health.d/wmi.conf +++ b/health/health.d/wmi.conf @@ -1,29 +1,11 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -## Availability - - template: wmi_last_collected_secs - on: cpu.collector_duration - class: Windows -component: Availability - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - ## CPU template: wmi_10min_cpu_usage on: wmi.cpu_utilization_total - class: Windows + class: Utilization + type: Windows component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt @@ -40,9 +22,9 @@ component: CPU template: wmi_ram_in_use on: wmi.memory_utilization - class: Windows + class: Utilization + type: Windows component: Memory - type: Utilization os: linux hosts: * calc: ($used) * 100 / ($used + $available) @@ -56,9 +38,9 @@ component: Memory template: wmi_swap_in_use on: wmi.memory_swap_utilization - class: Windows + class: Utilization + type: Windows component: Memory - type: Utilization os: linux hosts: * calc: ($used) * 100 / ($used + $available) @@ -75,9 +57,9 @@ component: Memory template: wmi_inbound_packets_discarded on: wmi.net_discarded - class: Windows + class: Errors + type: Windows component: Network - type: Errors os: linux hosts: * families: * @@ -91,9 +73,9 @@ component: Network template: wmi_outbound_packets_discarded on: wmi.net_discarded - class: Windows + class: Errors + type: Windows component: Network - type: Errors os: linux hosts: * families: * @@ -107,9 +89,9 @@ component: Network template: wmi_inbound_packets_errors on: wmi.net_errors - class: Windows + class: Errors + type: Windows component: Network - type: Errors os: linux hosts: * families: * @@ -123,9 +105,9 @@ component: Network template: wmi_outbound_packets_errors on: wmi.net_errors - class: Windows + class: Errors + type: Windows component: Network - type: Errors os: linux hosts: * families: * @@ -142,9 +124,9 @@ component: Network template: wmi_disk_in_use on: wmi.logical_disk_utilization - class: Windows + class: Utilization + type: Windows component: Disk - type: Utilization os: linux hosts: * calc: ($used) * 100 / ($used + $free) diff 
--git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index 93c406b7a..fc69d0288 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -1,26 +1,9 @@ -# make sure x509check is running - - template: x509check_last_collected_secs - on: x509check.time_until_expiration - class: Certificates -component: x509 certificates - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 60s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - template: x509check_days_until_expiration on: x509check.time_until_expiration - class: Certificates + class: Latency + type: Certificates component: x509 certificates - type: Latency calc: $expiry units: seconds every: 60s @@ -31,9 +14,9 @@ component: x509 certificates template: x509check_revocation_status on: x509check.revocation_status - class: Certificates + class: Errors + type: Certificates component: x509 certificates - type: Errors calc: $revoked every: 60s crit: $this != nan AND $this != 0 diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf index d6f5fa2fe..785838d47 100644 --- a/health/health.d/zfs.conf +++ b/health/health.d/zfs.conf @@ -1,9 +1,9 @@ alarm: zfs_memory_throttle on: zfs.memory_ops - class: System + class: Utilization + type: System component: File system - type: Utilization lookup: sum -10m unaligned absolute of throttled units: events every: 1m @@ -16,9 +16,9 @@ component: File system template: zfs_pool_state_warn on: zfspool.state - class: System + class: Errors + type: System component: File system - type: Errors calc: $degraded units: boolean every: 10s @@ -29,9 +29,9 @@ component: File system template: zfs_pool_state_crit on: zfspool.state - class: System + class: Errors + type: System component: File 
system - type: Errors calc: $faulted + $unavail units: boolean every: 10s diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf deleted file mode 100644 index 8c7d5a73d..000000000 --- a/health/health.d/zookeeper.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure zookeeper is running - - template: zookeeper_last_collected_secs - on: zookeeper.requests - class: KV Storage -component: ZooKeeper - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.h b/health/health.h index 56331b227..09040b3a8 100644 --- a/health/health.h +++ b/health/health.h @@ -3,7 +3,7 @@ #ifndef NETDATA_HEALTH_H #define NETDATA_HEALTH_H 1 -#include "../daemon/common.h" +#include "daemon/common.h" #define NETDATA_PLUGIN_HOOK_HEALTH \ { \ @@ -27,6 +27,7 @@ extern unsigned int default_health_enabled; #define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040 #define HEALTH_ENTRY_FLAG_SAVED 0x10000000 +#define HEALTH_ENTRY_FLAG_ACLK_QUEUED 0x20000000 #define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000 #ifndef HEALTH_LISTEN_PORT @@ -63,6 +64,7 @@ extern ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, + uuid_t config_hash_id, time_t when, const char *name, const char *chart, @@ -96,6 +98,8 @@ extern void *health_cmdapi_thread(void *ptr); extern void health_label_log_save(RRDHOST *host); +extern char *health_edit_command_from_source(const char *source); + extern SIMPLE_PATTERN *health_pattern_from_foreach(char *s); #endif //NETDATA_HEALTH_H diff --git a/health/health_config.c b/health/health_config.c index 756023715..35234df15 100644 --- a/health/health_config.c +++ b/health/health_config.c 
@@ -473,6 +473,29 @@ static inline char *health_source_file(size_t line, const char *file) { return strdupz(buffer); } +char *health_edit_command_from_source(const char *source) +{ + char buffer[FILENAME_MAX + 1]; + char *temp = strdupz(source); + char *line_num = strchr(temp, '@'); + char *file_no_path = strrchr(temp, '/'); + + if (likely(file_no_path && line_num)) { + *line_num = '\0'; + snprintfz( + buffer, + FILENAME_MAX, + "sudo %s/edit-config health.d/%s=%s", + netdata_configured_user_config_dir, + file_no_path + 1, + temp); + } else + buffer[0] = '\0'; + + freez(temp); + return strdupz(buffer); +} + static inline void strip_quotes(char *s) { while(*s) { if(*s == '\'' || *s == '"') *s = ' '; @@ -480,6 +503,40 @@ static inline void strip_quotes(char *s) { } } +static inline void alert_config_free(struct alert_config *cfg) +{ + freez(cfg->alarm); + freez(cfg->template_key); + freez(cfg->os); + freez(cfg->host); + freez(cfg->on); + freez(cfg->families); + freez(cfg->plugin); + freez(cfg->module); + freez(cfg->charts); + freez(cfg->lookup); + freez(cfg->calc); + freez(cfg->warn); + freez(cfg->crit); + freez(cfg->every); + freez(cfg->green); + freez(cfg->red); + freez(cfg->exec); + freez(cfg->to); + freez(cfg->units); + freez(cfg->info); + freez(cfg->classification); + freez(cfg->component); + freez(cfg->type); + freez(cfg->delay); + freez(cfg->options); + freez(cfg->repeat); + freez(cfg->host_labels); + freez(cfg->p_db_lookup_dimensions); + freez(cfg->p_db_lookup_method); + freez(cfg); +} + static int health_readfile(const char *filename, void *data) { RRDHOST *host = (RRDHOST *)data; @@ -554,6 +611,7 @@ static int health_readfile(const char *filename, void *data) { RRDCALC *rc = NULL; RRDCALCTEMPLATE *rt = NULL; + struct alert_config *alert_cfg = NULL; int ignore_this = 0; size_t line = 0, append = 0; @@ -603,16 +661,18 @@ static int health_readfile(const char *filename, void *data) { if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { if(rc) { - 
if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { + if(ignore_this || !alert_hash_and_store_config(rc->config_hash_id, alert_cfg) || !rrdcalc_add_alarm_from_config(host, rc)) { rrdcalc_free(rc); + alert_config_free(alert_cfg); } // health_add_alarms_loop(host, rc, ignore_this) ; } if(rt) { - if (ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) + if (ignore_this || !alert_hash_and_store_config(rt->config_hash_id, alert_cfg) || !rrdcalctemplate_add_template_from_config(host, rt)) { rrdcalctemplate_free(rt); - + alert_config_free(alert_cfg); + } rt = NULL; } @@ -629,25 +689,30 @@ static int health_readfile(const char *filename, void *data) { rc->old_status = RRDCALC_STATUS_UNINITIALIZED; rc->warn_repeat_every = host->health_default_warn_repeat_every; rc->crit_repeat_every = host->health_default_crit_repeat_every; + alert_cfg = callocz(1, sizeof(struct alert_config)); if(rrdvar_fix_name(rc->name)) error("Health configuration renamed alarm '%s' to '%s'", value, rc->name); + alert_cfg->alarm = strdupz(rc->name); ignore_this = 0; } else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) { if(rc) { // health_add_alarms_loop(host, rc, ignore_this) ; - if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { + if(ignore_this || !alert_hash_and_store_config(rc->config_hash_id, alert_cfg) || !rrdcalc_add_alarm_from_config(host, rc)) { rrdcalc_free(rc); + alert_config_free(alert_cfg); } rc = NULL; } if(rt) { - if(ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) + if(ignore_this || !alert_hash_and_store_config(rt->config_hash_id, alert_cfg) || !rrdcalctemplate_add_template_from_config(host, rt)) { rrdcalctemplate_free(rt); + alert_config_free(alert_cfg); + } } rt = callocz(1, sizeof(RRDCALCTEMPLATE)); @@ -659,14 +724,17 @@ static int health_readfile(const char *filename, void *data) { rt->delay_multiplier = 1.0; rt->warn_repeat_every = host->health_default_warn_repeat_every; rt->crit_repeat_every = 
host->health_default_crit_repeat_every; + alert_cfg = callocz(1, sizeof(struct alert_config)); if(rrdvar_fix_name(rt->name)) error("Health configuration renamed template '%s' to '%s'", value, rt->name); + alert_cfg->template_key = strdupz(rt->name); ignore_this = 0; } else if(hash == hash_os && !strcasecmp(key, HEALTH_OS_KEY)) { char *os_match = value; + if (alert_cfg) alert_cfg->os = strdupz(value); SIMPLE_PATTERN *os_pattern = simple_pattern_create(os_match, NULL, SIMPLE_PATTERN_EXACT); if(!simple_pattern_matches(os_pattern, host->os)) { @@ -683,6 +751,7 @@ static int health_readfile(const char *filename, void *data) { } else if(hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) { char *host_match = value; + if (alert_cfg) alert_cfg->host = strdupz(value); SIMPLE_PATTERN *host_pattern = simple_pattern_create(host_match, NULL, SIMPLE_PATTERN_EXACT); if(!simple_pattern_matches(host_pattern, host->hostname)) { @@ -699,6 +768,7 @@ static int health_readfile(const char *filename, void *data) { } else if(rc) { if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { + alert_cfg->on = strdupz(value); if(rc->chart) { if(strcmp(rc->chart, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -710,6 +780,7 @@ static int health_readfile(const char *filename, void *data) { rc->hash_chart = simple_hash(rc->chart); } else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) { + alert_cfg->classification = strdupz(value); if(rc->classification) { if(strcmp(rc->classification, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", @@ -721,6 +792,7 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rc->classification); } else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) { + alert_cfg->component = strdupz(value); if(rc->component) { if(strcmp(rc->component, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -732,6 +804,7 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rc->component); } else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) { + alert_cfg->type = strdupz(value); if(rc->type) { if(strcmp(rc->type, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -743,18 +816,32 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rc->type); } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { + alert_cfg->lookup = strdupz(value); health_parse_db_lookup(line, filename, value, &rc->group, &rc->after, &rc->before, &rc->update_every, &rc->options, &rc->dimensions, &rc->foreachdim); if(rc->foreachdim) { rc->spdim = health_pattern_from_foreach(rc->foreachdim); } + if (rc->after) { + if (rc->dimensions) + alert_cfg->p_db_lookup_dimensions = strdupz(rc->dimensions); + if (rc->group) + alert_cfg->p_db_lookup_method = strdupz(group_method2string(rc->group)); + alert_cfg->p_db_lookup_options = rc->options; + alert_cfg->p_db_lookup_after = rc->after; + alert_cfg->p_db_lookup_before = rc->before; + alert_cfg->p_update_every = rc->update_every; + } } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { + alert_cfg->every = strdupz(value); if(!config_parse_duration(value, &rc->update_every)) error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", line, 
filename, rc->name, key, value); + alert_cfg->p_update_every = rc->update_every; } else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { + alert_cfg->green = strdupz(value); char *e; rc->green = str2ld(value, &e); if(e && *e) { @@ -763,6 +850,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { + alert_cfg->red = strdupz(value); char *e; rc->red = str2ld(value, &e); if(e && *e) { @@ -771,6 +859,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { + alert_cfg->calc = strdupz(value); const char *failed_at = NULL; int error = 0; rc->calculation = expression_parse(value, &failed_at, &error); @@ -780,6 +869,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) { + alert_cfg->warn = strdupz(value); const char *failed_at = NULL; int error = 0; rc->warning = expression_parse(value, &failed_at, &error); @@ -789,6 +879,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) { + alert_cfg->crit = strdupz(value); const char *failed_at = NULL; int error = 0; rc->critical = expression_parse(value, &failed_at, &error); @@ -798,6 +889,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { + alert_cfg->exec = strdupz(value); if(rc->exec) { if(strcmp(rc->exec, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", @@ -808,6 +900,7 @@ static int health_readfile(const char *filename, void *data) { rc->exec = strdupz(value); } else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { + alert_cfg->to = strdupz(value); if(rc->recipient) { if(strcmp(rc->recipient, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -818,6 +911,7 @@ static int health_readfile(const char *filename, void *data) { rc->recipient = strdupz(value); } else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { + alert_cfg->units = strdupz(value); if(rc->units) { if(strcmp(rc->units, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -829,6 +923,7 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rc->units); } else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { + alert_cfg->info = strdupz(value); if(rc->info) { if(strcmp(rc->info, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", @@ -840,17 +935,21 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rc->info); } else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { + alert_cfg->delay = strdupz(value); health_parse_delay(line, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier); } else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { + alert_cfg->options = strdupz(value); rc->options |= health_parse_options(value); } else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + alert_cfg->repeat = strdupz(value); health_parse_repeat(line, filename, value, &rc->warn_repeat_every, &rc->crit_repeat_every); } else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { + alert_cfg->host_labels = strdupz(value); if(rc->labels) { if(strcmp(rc->labels, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", @@ -864,6 +963,7 @@ static int health_readfile(const char *filename, void *data) { rc->splabels = simple_pattern_create(rc->labels, NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) { + alert_cfg->plugin = strdupz(value); freez(rc->plugin_match); simple_pattern_free(rc->plugin_pattern); @@ -871,6 +971,7 @@ static int health_readfile(const char *filename, void *data) { rc->plugin_pattern = simple_pattern_create(rc->plugin_match, NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) { + alert_cfg->module = strdupz(value); freez(rc->module_match); simple_pattern_free(rc->module_pattern); @@ -884,6 +985,7 @@ static int health_readfile(const char *filename, void *data) { } else if(rt) { if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { + alert_cfg->on = strdupz(value); if(rt->context) { if(strcmp(rt->context, value) != 0) error("Health 
configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -895,6 +997,7 @@ static int health_readfile(const char *filename, void *data) { rt->hash_context = simple_hash(rt->context); } else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) { + alert_cfg->classification = strdupz(value); if(rt->classification) { if(strcmp(rt->classification, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -906,6 +1009,7 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rt->classification); } else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) { + alert_cfg->component = strdupz(value); if(rt->component) { if(strcmp(rt->component, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -917,6 +1021,7 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rt->component); } else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) { + alert_cfg->type = strdupz(value); if(rt->type) { if(strcmp(rt->type, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", @@ -928,6 +1033,7 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rt->type); } else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) { + alert_cfg->families = strdupz(value); freez(rt->family_match); simple_pattern_free(rt->family_pattern); @@ -935,6 +1041,7 @@ static int health_readfile(const char *filename, void *data) { rt->family_pattern = simple_pattern_create(rt->family_match, NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) { + alert_cfg->plugin = strdupz(value); freez(rt->plugin_match); simple_pattern_free(rt->plugin_pattern); @@ -942,6 +1049,7 @@ static int health_readfile(const char *filename, void *data) { rt->plugin_pattern = simple_pattern_create(rt->plugin_match, NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) { + alert_cfg->module = strdupz(value); freez(rt->module_match); simple_pattern_free(rt->module_pattern); @@ -949,6 +1057,7 @@ static int health_readfile(const char *filename, void *data) { rt->module_pattern = simple_pattern_create(rt->module_match, NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_charts && !strcasecmp(key, HEALTH_CHARTS_KEY)) { + alert_cfg->charts = strdupz(value); freez(rt->charts_match); simple_pattern_free(rt->charts_pattern); @@ -956,18 +1065,32 @@ static int health_readfile(const char *filename, void *data) { rt->charts_pattern = simple_pattern_create(rt->charts_match, NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { + alert_cfg->lookup = strdupz(value); health_parse_db_lookup(line, filename, value, &rt->group, &rt->after, &rt->before, &rt->update_every, &rt->options, &rt->dimensions, &rt->foreachdim); if(rt->foreachdim) { rt->spdim = health_pattern_from_foreach(rt->foreachdim); } + if (rt->after) { + if (rt->dimensions) + alert_cfg->p_db_lookup_dimensions = strdupz(rt->dimensions); + if (rt->group) + 
alert_cfg->p_db_lookup_method = strdupz(group_method2string(rt->group)); + alert_cfg->p_db_lookup_options = rt->options; + alert_cfg->p_db_lookup_after = rt->after; + alert_cfg->p_db_lookup_before = rt->before; + alert_cfg->p_update_every = rt->update_every; + } } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { + alert_cfg->every = strdupz(value); if(!config_parse_duration(value, &rt->update_every)) error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.", line, filename, rt->name, key, value); + alert_cfg->p_update_every = rt->update_every; } else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { + alert_cfg->green = strdupz(value); char *e; rt->green = str2ld(value, &e); if(e && *e) { @@ -976,6 +1099,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { + alert_cfg->red = strdupz(value); char *e; rt->red = str2ld(value, &e); if(e && *e) { @@ -984,6 +1108,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { + alert_cfg->calc = strdupz(value); const char *failed_at = NULL; int error = 0; rt->calculation = expression_parse(value, &failed_at, &error); @@ -993,6 +1118,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) { + alert_cfg->warn = strdupz(value); const char *failed_at = NULL; int error = 0; rt->warning = expression_parse(value, &failed_at, &error); @@ -1002,6 +1128,7 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) { + alert_cfg->crit = strdupz(value); const char *failed_at = NULL; int error = 0; rt->critical = expression_parse(value, &failed_at, &error); @@ -1011,6 +1138,7 @@ static int health_readfile(const char *filename, void *data) { } 
} else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { + alert_cfg->exec = strdupz(value); if(rt->exec) { if(strcmp(rt->exec, value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -1021,6 +1149,7 @@ static int health_readfile(const char *filename, void *data) { rt->exec = strdupz(value); } else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { + alert_cfg->to = strdupz(value); if(rt->recipient) { if(strcmp(rt->recipient, value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -1031,6 +1160,7 @@ static int health_readfile(const char *filename, void *data) { rt->recipient = strdupz(value); } else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { + alert_cfg->units = strdupz(value); if(rt->units) { if(strcmp(rt->units, value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", @@ -1042,6 +1172,7 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rt->units); } else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { + alert_cfg->info = strdupz(value); if(rt->info) { if(strcmp(rt->info, value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", @@ -1053,17 +1184,21 @@ static int health_readfile(const char *filename, void *data) { strip_quotes(rt->info); } else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { + alert_cfg->delay = strdupz(value); health_parse_delay(line, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier); } else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { + alert_cfg->options = strdupz(value); rt->options |= health_parse_options(value); } else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + alert_cfg->repeat = strdupz(value); health_parse_repeat(line, filename, value, &rt->warn_repeat_every, &rt->crit_repeat_every); } else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { + alert_cfg->host_labels = strdupz(value); if(rt->labels) { if(strcmp(rt->labels, value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", @@ -1089,16 +1224,20 @@ static int health_readfile(const char *filename, void *data) { if(rc) { //health_add_alarms_loop(host, rc, ignore_this) ; - if(ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { + if(ignore_this || !alert_hash_and_store_config(rc->config_hash_id, alert_cfg) || !rrdcalc_add_alarm_from_config(host, rc)) { rrdcalc_free(rc); } } if(rt) { - if(ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) + if(ignore_this || !alert_hash_and_store_config(rt->config_hash_id, alert_cfg) || !rrdcalctemplate_add_template_from_config(host, rt)) { rrdcalctemplate_free(rt); + } } + if (alert_cfg) + alert_config_free(alert_cfg); + fclose(fp); return 1; } diff --git a/health/health_json.c b/health/health_json.c index 4df44611c..a21d5a4fd 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -14,12 +14,19 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const } void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { + char *edit_command = ae->source ? 
health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0"); + char config_hash_id[GUID_LEN + 1]; + uuid_unparse_lower(ae->config_hash_id, config_hash_id); + buffer_sprintf(wb, "\n\t{\n" "\t\t\"hostname\": \"%s\",\n" + "\t\t\"utc_offset\": %d,\n" + "\t\t\"timezone\": \"%s\",\n" "\t\t\"unique_id\": %u,\n" "\t\t\"alarm_id\": %u,\n" "\t\t\"alarm_event_id\": %u,\n" + "\t\t\"config_hash_id\": \"%s\",\n" "\t\t\"name\": \"%s\",\n" "\t\t\"chart\": \"%s\",\n" "\t\t\"family\": \"%s\",\n" @@ -34,6 +41,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) "\t\t\"recipient\": \"%s\",\n" "\t\t\"exec_code\": %d,\n" "\t\t\"source\": \"%s\",\n" + "\t\t\"command\": \"%s\",\n" "\t\t\"units\": \"%s\",\n" "\t\t\"when\": %lu,\n" "\t\t\"duration\": %lu,\n" @@ -49,9 +57,12 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) "\t\t\"last_repeat\": \"%lu\",\n" "\t\t\"silenced\": \"%s\",\n" , host->hostname + , host->utc_offset + , host->abbrev_timezone , ae->unique_id , ae->alarm_id , ae->alarm_event_id + , config_hash_id , ae->name , ae->chart , ae->family @@ -66,6 +77,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) , ae->recipient?ae->recipient:host->health_default_recipient , ae->exec_code , ae->source + , edit_command , ae->units?ae->units:"" , (unsigned long)ae->when , (unsigned long)ae->duration @@ -114,6 +126,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) buffer_strcat(wb, "\t}"); freez(replaced_info); + freez(edit_command); } void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { @@ -178,9 +191,13 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC } } + char hash_id[GUID_LEN + 1]; + uuid_unparse_lower(rc->config_hash_id, hash_id); + buffer_sprintf(wb, "\t\t\"%s.%s\": {\n" "\t\t\t\"id\": %lu,\n" + "\t\t\t\"config_hash_id\": \"%s\",\n" "\t\t\t\"name\": \"%s\",\n" "\t\t\t\"chart\": 
\"%s\",\n" "\t\t\t\"family\": \"%s\",\n" @@ -212,6 +229,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC "\t\t\t\"last_repeat\": \"%lu\",\n" , rc->chart, rc->name , (unsigned long)rc->id + , hash_id , rc->name , rc->chart , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:"" diff --git a/health/health_log.c b/health/health_log.c index de0a0883b..d20085d9e 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -38,39 +38,41 @@ static inline void health_log_rotate(RRDHOST *host) { } if(unlikely(host->health_log_entries_written > rotate_every)) { - health_alarm_log_close(host); + if(unlikely(host->health_log_fp)) { + health_alarm_log_close(host); - char old_filename[FILENAME_MAX + 1]; - snprintfz(old_filename, FILENAME_MAX, "%s.old", host->health_log_filename); + char old_filename[FILENAME_MAX + 1]; + snprintfz(old_filename, FILENAME_MAX, "%s.old", host->health_log_filename); - if(unlink(old_filename) == -1 && errno != ENOENT) - error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, old_filename); + if(unlink(old_filename) == -1 && errno != ENOENT) + error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, old_filename); - if(link(host->health_log_filename, old_filename) == -1 && errno != ENOENT) - error("HEALTH [%s]: cannot move file '%s' to '%s'.", host->hostname, host->health_log_filename, old_filename); + if(link(host->health_log_filename, old_filename) == -1 && errno != ENOENT) + error("HEALTH [%s]: cannot move file '%s' to '%s'.", host->hostname, host->health_log_filename, old_filename); - if(unlink(host->health_log_filename) == -1 && errno != ENOENT) - error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, host->health_log_filename); + if(unlink(host->health_log_filename) == -1 && errno != ENOENT) + error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, host->health_log_filename); - // open it with truncate - host->health_log_fp 
= fopen(host->health_log_filename, "w"); + // open it with truncate + host->health_log_fp = fopen(host->health_log_filename, "w"); - if(host->health_log_fp) - fclose(host->health_log_fp); - else - error("HEALTH [%s]: cannot truncate health log '%s'", host->hostname, host->health_log_filename); + if(host->health_log_fp) + fclose(host->health_log_fp); + else + error("HEALTH [%s]: cannot truncate health log '%s'", host->hostname, host->health_log_filename); - host->health_log_fp = NULL; + host->health_log_fp = NULL; - host->health_log_entries_written = 0; - health_alarm_log_open(host); + host->health_log_entries_written = 0; + health_alarm_log_open(host); + } } } inline void health_label_log_save(RRDHOST *host) { health_log_rotate(host); - if(likely(host->health_log_fp)) { + if(unlikely(host->health_log_fp)) { BUFFER *wb = buffer_create(1024); rrdhost_check_rdlock(host); netdata_rwlock_rdlock(&host->labels.labels_rwlock); @@ -101,7 +103,7 @@ inline void health_label_log_save(RRDHOST *host) { inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { health_log_rotate(host); - if(likely(host->health_log_fp)) { + if(unlikely(host->health_log_fp)) { if(unlikely(fprintf(host->health_log_fp , "%c\t%s" "\t%08x\t%08x\t%08x\t%08x\t%08x" @@ -155,13 +157,12 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { ae->flags |= HEALTH_ENTRY_FLAG_SAVED; host->health_log_entries_written++; } - } + }else + sql_health_alarm_log_save(host, ae); + #ifdef ENABLE_ACLK if (netdata_cloud_setting) { - if ((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) || - ((ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL))) { - aclk_update_alarm(host, ae); - } + sql_queue_alarm_to_aclk(host, ae); } #endif } @@ -368,7 +369,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char ae->last_repeat = last_repeat; - if (likely(entries > 28)) { + if (likely(entries > 30)) { 
freez(ae->classification); ae->classification = strdupz(pointers[28]); if(!*ae->classification) { freez(ae->classification); ae->classification = NULL; } @@ -392,9 +393,13 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char if(unlikely(*pointers[0] == 'A')) { ae->next = host->health_log.alarms; host->health_log.alarms = ae; + sql_health_alarm_log_insert(host, ae); loaded++; } - else updated++; + else { + sql_health_alarm_log_update(host, ae); + updated++; + } if(unlikely(ae->unique_id > host->health_max_unique_id)) host->health_max_unique_id = ae->unique_id; @@ -444,8 +449,6 @@ inline void health_alarm_log_load(RRDHOST *host) { health_alarm_log_read(host, fp, host->health_log_filename); fclose(fp); } - - health_alarm_log_open(host); } @@ -456,6 +459,7 @@ inline ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, + uuid_t config_hash_id, time_t when, const char *name, const char *chart, @@ -487,6 +491,8 @@ inline ALARM_ENTRY* health_create_alarm_entry( ae->hash_chart = simple_hash(ae->chart); } + uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id)); + if(family) ae->family = strdupz(family); diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index 9a3a80ad6..08a32ff10 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -239,6 +239,11 @@ else calc_param_values="${22}" # the values of the parameters in the expression, at the time of the evaluation total_warnings="${23}" # Total number of alarms in WARNING state total_critical="${24}" # Total number of alarms in CRITICAL state + total_warn_alarms="${25}" # List of alarms in warning state + total_crit_alarms="${26}" # List of alarms in critical state + classification="${27}" # The class field from .conf files + edit_command_line="${28}" # The command to edit the alarm, with the line number + sender_host="${29}" # The host sending this 
notification fi # ----------------------------------------------------------------------------- @@ -252,6 +257,17 @@ else host="${args_host}" fi +# ----------------------------------------------------------------------------- +# Do the same for sender_host (find a suitable hostname to use, if netdata did not supply a hostname) + +if [ -z "${sender_host}" ]; then + this_host=$(hostname -s 2>/dev/null) + s_host="${this_host}" + sender_host="${this_host}" +else + s_host="${sender_host}" +fi + # ----------------------------------------------------------------------------- # screen statuses we don't need to send a notification @@ -303,7 +319,7 @@ SLACK_WEBHOOK_URL= # Microsoft Teams configs MSTEAMS_WEBHOOK_URL= -# Legacy Microsoft Teams configs for backwards compatability: +# Legacy Microsoft Teams configs for backwards compatibility: declare -A role_recipients_msteam # rocketchat configs @@ -810,6 +826,14 @@ date=$(date --date=@${when} "${date_format}" 2>/dev/null) [ -z "${date}" ] && date=$(date --date=@${when} 2>/dev/null) [ -z "${date}" ] && date=$(date 2>/dev/null) +# ----------------------------------------------------------------------------- +# get the date in utc the alarm happened + +date_utc=$(date --date=@${when} "${date_format}" -u 2>/dev/null) +[ -z "${date_utc}" ] && date_utc=$(date -u "${date_format}" 2>/dev/null) +[ -z "${date_utc}" ] && date_utc=$(date -u --date=@${when} 2>/dev/null) +[ -z "${date_utc}" ] && date_utc=$(date -u 2>/dev/null) + # ---------------------------------------------------------------------------- # prepare some extra headers if we've been asked to thread e-mails if [ "${SEND_EMAIL}" == "YES" ] && [ "${EMAIL_THREADING}" != "NO" ]; then @@ -915,7 +939,7 @@ send_email() { fi [ -n "${sender_email}" ] && opts+=(-f "${sender_email}") - [ -n "${sender_name}" ] && sendmail --help 2>&1 | grep -q "\-F " && opts+=(-F "${sender_name}") + [ -n "${sender_name}" ] && ${sendmail} -F 2>&1 | head -1 | grep -qv "sendmail: unrecognized option: F" && 
opts+=(-F "${sender_name}") if [ "${debug}" = "1" ]; then echo >&2 "--- BEGIN sendmail command ---" @@ -1364,15 +1388,15 @@ EOF )" # Replacing in the webhook CHANNEL string by the MS Teams channel name from conf file. - webhook="${webhook//CHANNEL/${channel}}" + cur_webhook="${webhook//CHANNEL/${channel}}" - httpcode=$(docurl -H "Content-Type: application/json" -d "${payload}" "${webhook}") + httpcode=$(docurl -H "Content-Type: application/json" -d "${payload}" "${cur_webhook}") if [ "${httpcode}" = "200" ]; then - info "sent Microsoft team notification for: ${host} ${chart}.${name} is ${status} to '${webhook}'" + info "sent Microsoft team notification for: ${host} ${chart}.${name} is ${status} to '${cur_webhook}'" sent=$((sent + 1)) else - error "failed to send Microsoft team notification for: ${host} ${chart}.${name} is ${status} to '${webhook}', with HTTP response status code ${httpcode}." + error "failed to send Microsoft team notification for: ${host} ${chart}.${name} is ${status} to '${cur_webhook}', with HTTP response status code ${httpcode}." 
fi done @@ -2113,12 +2137,12 @@ send_dynatrace() { [ "${SEND_DYNATRACE}" != "YES" ] && return 1 local dynatrace_url="${DYNATRACE_SERVER}/e/${DYNATRACE_SPACE}/api/v1/events" - local description="NetData Notification for: ${host} ${chart}.${name} is ${status}" + local description="Netdata Notification for: ${host} ${chart}.${name} is ${status}" local payload="" payload=$(cat </dev/null url_family="${REPLY}" urlencode "${name}" >/dev/null url_name="${REPLY}" +urlencode "${value_string}" >/dev/null +url_value_string="${REPLY}" -redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}" +redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}&alarm_status=${status}&alarm_chart=${chart}&alarm_value=${url_value_string}" GOTOCLOUD=0 if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then @@ -2284,9 +2310,9 @@ fi if [ ${GOTOCLOUD} -eq 0 ]; then goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}" else - # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud - #goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" - goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}" + # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. 
For now, just lead to netdata.cloud + # Re-allow alarm redirection, for alarms 2.0, new template + goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" fi # the severity of the alarm @@ -2311,48 +2337,79 @@ alarm="${name//_/ } = ${value_string}" # the image of the alarm image="${images_base_url}/images/banner-icon-144x144.png" +# have a default email status, in case the following case does not catch it +status_email_subject="${status}" + # prepare the title based on status case "${status}" in CRITICAL) image="${images_base_url}/images/alert-128-red.png" + alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_critical.png" status_message="is critical" + status_email_subject="Critical" color="#ca414b" + rich_status_raised_for="Raised to critical, for ${non_clear_duration_txt}" + background_color="#FFEBEF" + border_color="#FF4136" + text_color="#FF4136" + action_text_color="#FFFFFF" ;; WARNING) image="${images_base_url}/images/alert-128-orange.png" + alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_warning.png" status_message="needs attention" + status_email_subject="Warning" color="#ffc107" + rich_status_raised_for="Raised to warning, for ${non_clear_duration_txt}" + background_color="#FFF8E1" + border_color="#FFC300" + text_color="#536775" + action_text_color="#35414A" ;; CLEAR) image="${images_base_url}/images/check-mark-2-128-green.png" + alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_recovered.png" status_message="recovered" + status_email_subject="Clear" color="#77ca6d" + rich_status_raised_for= + background_color="#E5F5E8" + border_color="#68C47D" + text_color="#00AB44" + action_text_color="#FFFFFF" ;; esac +# the html email subject +html_email_subject="${status_email_subject}, ${name} = ${value_string}, on ${host}" + if [ "${status}" = "CLEAR" ]; then severity="Recovered from ${old_status}" if [ ${non_clear_duration} -gt ${duration} 
]; then raised_for="(alarm was raised for ${non_clear_duration_txt})" fi + rich_status_raised_for="Recovered from ${old_status,,}, ${raised_for}" # don't show the value when the status is CLEAR # for certain alarms, this value might not have any meaning alarm="${name//_/ } ${raised_for}" + html_email_subject="${status_email_subject}, ${name} ${raised_for}, on ${host}" elif { [ "${old_status}" = "WARNING" ] && [ "${status}" = "CRITICAL" ]; }; then severity="Escalated to ${status}" if [ ${non_clear_duration} -gt ${duration} ]; then raised_for="(alarm is raised for ${non_clear_duration_txt})" fi + rich_status_raised_for="Escalated to critical, ${raised_for}" elif { [ "${old_status}" = "CRITICAL" ] && [ "${status}" = "WARNING" ]; }; then severity="Demoted to ${status}" if [ ${non_clear_duration} -gt ${duration} ]; then raised_for="(alarm is raised for ${non_clear_duration_txt})" fi + rich_status_raised_for="Demoted to warning, ${raised_for}" else raised_for= @@ -2628,6 +2685,13 @@ Subject: ${host} ${status_message} - ${name//_/ } - ${chart} MIME-Version: 1.0 Content-Type: multipart/alternative; boundary="multipart-boundary" ${email_thread_headers} +X-Netdata-Severity: ${status,,} +X-Netdata-Alert-Name: $name +X-Netdata-Chart: $chart +X-Netdata-Family: $family +X-Netdata-Classification: $classification +X-Netdata-Host: $host +X-Netdata-Role: $roles This is a MIME-encoded multipart message @@ -2638,120 +2702,742 @@ EOF else +now=$(date "+%s") + +if [ -n "$total_warn_alarms" ]; then + while read -d, -r pair; do + IFS='=' read -r key val <<<"$pair" + + date_w=$(date --date=@${val} "${date_format}" 2>/dev/null) + [ -z "${date_w}" ] && date_w=$(date "${date_format}" 2>/dev/null) + [ -z "${date_w}" ] && date_w=$(date --date=@${val} 2>/dev/null) + [ -z "${date_w}" ] && date_w=$(date 2>/dev/null) + + elapsed=$((now - val)) + + duration4human ${elapsed} >/dev/null + elapsed_txt="${REPLY}" + + WARN_ALARMS+=" +
+ + + + + + +
+ +
+ + + + + + + + + +
+
${key}
+
+
${date_w}
+
+
+ +
+ + + + + + +
+ + + + + + +
+
+ Warning for ${elapsed_txt} +
+
+
+
+ +
+
+ " + + done <<<"$total_warn_alarms," +fi + +if [ -n "$total_crit_alarms" ]; then + while read -d, -r pair; do + IFS='=' read -r key val <<<"$pair" + + date_c=$(date --date=@${val} "${date_format}" 2>/dev/null) + [ -z "${date_c}" ] && date_c=$(date "${date_format}" 2>/dev/null) + [ -z "${date_c}" ] && date_c=$(date --date=@${val} 2>/dev/null) + [ -z "${date_c}" ] && date_c=$(date 2>/dev/null) + + elapsed=$((now - val)) + + duration4human ${elapsed} >/dev/null + elapsed_txt="${REPLY}" + + CRIT_ALARMS+=" +
+ + + + + + +
+ +
+ + + + + + + + + +
+
${key}
+
+
${date_c}
+
+
+ +
+ + + + + + +
+ + + + + + +
+
+ Critical for ${elapsed_txt} +
+
+
+
+ +
+
+ " + + done <<<"$total_crit_alarms," +fi + +if [ -n "$edit_command_line" ]; then + IFS='=' read -r edit_command line <<<"$edit_command_line" +fi + IFS='' read -r -d '' email_html_part < - - - - - - - - - -
-
- + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ + + + + +
+ +
+ + + + + + +
+ - + + +
-
netdata notification
-
+ Netdata Logo +
+
+
+ +
+ + + + + + +
+ + - + + +
-

${host} ${status_message}

+
+
Notification
+
+
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+ +
+ + + + + + +
+
${name}
+
+
+ +
+ + + + + + +
+ + + + + + +
+ +
+
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+
on ${host}
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+
${value_string} +
+
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+
Details: ${info}
+
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+ + + + +
+

+ GO TO CHART +

+
+
+
+ +
+
+ +
+
+ +
+ +
+ + + + + + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
Chart: + ${chart}
+
+
Family: + ${family}
+
+
${rich_status_raised_for}
+
+

+

+ +
+
On + ${date}
+
+
By: + ${host}
+
+
Global time: + ${date_utc}
+
+

+

+ +
+
Classification: + ${classification}
+
+
Role: + ${roles}
+
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+ + + + + + +
+ + + + + + +
+ +
+
+
+
+ +
+ + + + + + +
+ + + + + + + + +
+
Want to know more about this alert?
+
+
Discuss and troubleshoot with others on the Netdata community forums
+
+
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+ + - + + +
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- ${chart} - Chart -
- ${alarm}${info_html} - Alarm -
- ${family} - Family -
- ${severity} - Severity -
${date} - ${raised_for_html} Time -
- ${calc_expression} - Evaluated Expression -
- ${calc_param_values} - Expression Variables -
- The host has ${total_warnings} WARNING and ${total_critical} CRITICAL alarm(s) raised. -
- View Netdata -
The source of this alarm is line ${src}
(alarms are configurable, edit this file to adapt the alarm to your needs) -
Sent by - netdata, the real-time performance and health monitoring, on ${host}. -
-
+
+ + + + + + +
+ +
+
+
+
+ +
+ + + + + + +
+ + + + + + + + + + + + +
+
Need to configure this alert?
+
+
Edit this alert's configuration file by logging into $s_host and running the following command:
+
+
${edit_command}
+ The alarm to edit is at line {${line}}
+
+
+
+ +
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+ +
+ + + + + + +
+
The node has + ${total_warnings} warning + and + ${total_critical} critical + additional active alert(s)
+
+ +
+
+ ${CRIT_ALARMS} + ${WARN_ALARMS} + +
+
+ +
+ + + + + + +
+ +
+ + + +
+ + + + -
+
© Netdata 2021 - The real-time performance and health monitoring
+
- +
+
+
+
+ + + + + + EOF send_email < Tags --> Manually applied tags create the Tag -# The NetData alarm will be sent as a Dynatrace Event to be correlated with all those hosts tagged with this Tag +# The Netdata alarm will be sent as a Dynatrace Event to be correlated with all those hosts tagged with this Tag # you created. # Required DYNATRACE_TAG_VALUE="" # Change this to what you want -DYNATRACE_ANNOTATION_TYPE="NetData Alarm" +DYNATRACE_ANNOTATION_TYPE="Netdata Alarm" # This can be CUSTOM_INFO, CUSTOM_ANNOTATION, CUSTOM_CONFIGURATION, CUSTOM_DEPLOYMENT # Applying default value diff --git a/health/notifications/syslog/README.md b/health/notifications/syslog/README.md index 456394d2f..360f6844d 100644 --- a/health/notifications/syslog/README.md +++ b/health/notifications/syslog/README.md @@ -17,7 +17,7 @@ netdata WARNING on hostname at Tue Apr 3 09:00:00 EDT 2018: disk_space._ out of System log targets are configured as recipients in [`/etc/netdata/health_alarm_notify.conf`](https://github.com/netdata/netdata/blob/36bedc044584dea791fd29455bdcd287c3306cb2/conf.d/health_alarm_notify.conf#L534) (to edit it on your system run `/etc/netdata/edit-config health_alarm_notify.conf`). -You can als configure per-role targets in the same file a bit further down. +You can also configure per-role targets in the same file a bit further down. Targets are defined as follows: -- cgit v1.2.3