diff options
Diffstat (limited to '')
-rw-r--r-- | health/Makefile.am | 1 | ||||
-rw-r--r-- | health/REFERENCE.md | 35 | ||||
-rw-r--r-- | health/health.c | 81 | ||||
-rw-r--r-- | health/health.d/boinc.conf | 4 | ||||
-rw-r--r-- | health/health.d/btrfs.conf | 9 | ||||
-rw-r--r-- | health/health.d/cockroachdb.conf | 10 | ||||
-rw-r--r-- | health/health.d/disks.conf | 10 | ||||
-rw-r--r-- | health/health.d/exporting.conf | 2 | ||||
-rw-r--r-- | health/health.d/httpcheck.conf | 5 | ||||
-rw-r--r-- | health/health.d/ioping.conf | 1 | ||||
-rw-r--r-- | health/health.d/mdstat.conf | 2 | ||||
-rw-r--r-- | health/health.d/net.conf | 18 | ||||
-rw-r--r-- | health/health.d/nvme.conf | 1 | ||||
-rw-r--r-- | health/health.d/ping.conf | 3 | ||||
-rw-r--r-- | health/health.d/plugin.conf | 11 | ||||
-rw-r--r-- | health/health.d/portcheck.conf | 3 | ||||
-rw-r--r-- | health/health.d/redis.conf | 4 | ||||
-rw-r--r-- | health/health.d/vsphere.conf | 8 | ||||
-rw-r--r-- | health/health.d/web_log.conf | 12 | ||||
-rw-r--r-- | health/health.d/windows.conf | 4 | ||||
-rw-r--r-- | health/health.h | 4 | ||||
-rw-r--r-- | health/health_config.c | 93 | ||||
-rw-r--r-- | health/health_json.c | 170 | ||||
-rw-r--r-- | health/health_log.c | 9 |
24 files changed, 200 insertions, 300 deletions
diff --git a/health/Makefile.am b/health/Makefile.am index ea1b6e961..0ef55c75e 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -69,6 +69,7 @@ dist_healthconfig_DATA = \ health.d/nvme.conf \ health.d/nut.conf \ health.d/pihole.conf \ + health.d/plugin.conf \ health.d/ping.conf \ health.d/postgres.conf \ health.d/portcheck.conf \ diff --git a/health/REFERENCE.md b/health/REFERENCE.md index b95dc852e..a36edd8cf 100644 --- a/health/REFERENCE.md +++ b/health/REFERENCE.md @@ -241,7 +241,8 @@ Netdata parses the following lines. Beneath the table is an in-depth explanation | [`delay`](#alarm-line-delay) | no | Optional hysteresis settings to prevent floods of notifications. | | [`repeat`](#alarm-line-repeat) | no | The interval for sending notifications when an alarm is in WARNING or CRITICAL mode. | | [`options`](#alarm-line-options) | no | Add an option to not clear alarms. | -| [`host labels`](#alarm-line-host-labels) | no | List of labels present on a host. | +| [`host labels`](#alarm-line-host-labels) | no | Restrict an alarm or template to a list of matching labels present on a host. | +| [`chart labels`](#alarm-line-chart-labels) | no | Restrict an alarm or template to a list of matching labels present on a chart. | | [`info`](#alarm-line-info) | no | A brief description of the alarm. | The `alarm` or `template` line must be the first line of any entity. @@ -446,6 +447,9 @@ For example, you can create a template on the `disk.io` context, but filter it t families: sda sdb ``` +Please note that the use of the `families` filter is planned to be deprecated in upcoming Netdata releases. +Please use [`chart labels`](#alarm-line-chart-labels) instead. + #### Alarm line `lookup` This line makes a database lookup to find a value. This result of this lookup is available as `$this`. @@ -696,6 +700,35 @@ host labels: installed = 201* See our [simple patterns docs](https://github.com/netdata/netdata/blob/master/libnetdata/simple_pattern/README.md) for more examples. 
+#### Alarm line `chart labels` + +Similar to host labels, the `chart labels` key can be used to control whether an alarm is loaded for a specific chart, based on +whether the chart's labels match or not. + +The list of chart labels present on each chart can be obtained from http://localhost:19999/api/v1/charts?all + +For example, each `disk_space` chart defines a chart label called `mount_point`, and each instance of this chart sets it to +the mount point it monitors. + +If you have, for example, an external disk mounted on `/mnt/disk1` and you don't want any related disk space alerts running for +it (but you do for all other mount points), you can add the following to the alert's configuration: + +```yaml +chart labels: mount_point=!/mnt/disk1 * +``` + +The `chart labels` value is a space-separated list that accepts simple patterns. If you use multiple different chart labels, +then the result is an OR between them. For example, the following: + +```yaml +chart labels: mount_point=/mnt/disk1 device=sda +``` + +will create the alert if the `mount_point` is `/mnt/disk1` or the `device` is `sda`. Furthermore, if a chart label name +is specified that does not exist in the chart, the chart won't be matched. + +See our [simple patterns docs](https://github.com/netdata/netdata/blob/master/libnetdata/simple_pattern/README.md) for more examples. + #### Alarm line `info` The info field can contain a small piece of text describing the alarm or template. 
This will be rendered in diff --git a/health/health.c b/health/health.c index 5c2b85bc5..df4798a20 100644 --- a/health/health.c +++ b/health/health.c @@ -412,17 +412,13 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // find the previous notification for the same alarm // which we have run the exec script // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set + RRDCALC_STATUS last_executed_status = -3; if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { - uint32_t id = ae->alarm_id; - ALARM_ENTRY *t; - for(t = ae->next; t ; t = t->next) { - if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN) - break; - } + int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status); - if(likely(t)) { + if (likely(ret == 1)) { // we have executed this alarm notification in the past - if(t && t->new_status == ae->new_status) { + if(last_executed_status == ae->new_status) { // don't send the notification for the same status again debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); @@ -561,6 +557,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); enqueue_alarm_notify_in_progress(ae); + health_alarm_log_save(host, ae); } else { error("Failed to format command arguments"); } @@ -628,35 +625,32 @@ static inline void health_alarm_log_process(RRDHOST *host) { // remember this for the next iteration host->health_last_processed_id = first_waiting; - bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max; - - if (!cleanup_excess_log_entries) - return; - - // cleanup excess entries in the log + //delete those that are updated, no in progress execution, and is not repeating netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); - ALARM_ENTRY 
*last = NULL; - unsigned int count = host->health_log.max * 2 / 3; - for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ; - - if(ae && last && last->next == ae) - last->next = NULL; - else - ae = NULL; - - while(ae) { - debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id); - - ALARM_ENTRY *t = ae->next; - - if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { - health_alarm_wait_for_execution(ae); + ALARM_ENTRY *prev = host->health_log.alarms; + for(ae = host->health_log.alarms; ae ; ae = ae->next) { + + if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) && + (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) && + (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && + !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)) + || + ((ae->new_status == RRDCALC_STATUS_REMOVED) && + (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && + (ae->when + 3600 < now_realtime_sec()))) + { + + if (ae == host->health_log.alarms) { + host->health_log.alarms = ae->next; + prev = ae->next; + } else { + prev->next = ae->next; + } health_alarm_log_free_one_nochecks_nounlink(ae); - host->health_log.count--; - } - - ae = t; + ae = prev; + } else + prev = ae; } netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); @@ -904,8 +898,24 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { return 0; } +static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) { +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; + if (unlikely(!wc)) { + return; + } + + if (wc->alert_queue_removed >= 1) { + wc->alert_queue_removed+=6; + } + } +#endif +} + static void health_execute_delayed_initializations(RRDHOST *host) { RRDSET *st; + bool must_postpone = false; if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return; rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); @@ -941,8 +951,11 @@ static void 
health_execute_delayed_initializations(RRDHOST *host) { rrdvar_store_for_chart(host, st); } rrddim_foreach_done(rd); + must_postpone = true; } rrdset_foreach_done(st); + if (must_postpone) + sql_health_postpone_queue_removed(host); } /** diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf index 7d7a4fdae..6f37787d7 100644 --- a/health/health.d/boinc.conf +++ b/health/health.d/boinc.conf @@ -8,7 +8,6 @@ component: BOINC os: * hosts: * - families: * lookup: average -10m unaligned of comperror units: tasks every: 1m @@ -26,7 +25,6 @@ component: BOINC component: BOINC os: * hosts: * - families: * lookup: average -10m unaligned of upload_failed units: tasks every: 1m @@ -44,7 +42,6 @@ component: BOINC component: BOINC os: * hosts: * - families: * lookup: average -10m unaligned of total units: tasks every: 1m @@ -62,7 +59,6 @@ component: BOINC component: BOINC os: * hosts: * - families: * lookup: average -10m unaligned of active calc: ($boinc_total_tasks >= 1) ? ($this) : (inf) units: tasks diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf index ab63ff28d..97b7a3a94 100644 --- a/health/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf @@ -6,7 +6,6 @@ component: File system os: * hosts: * - families: * calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) units: % every: 10s @@ -23,7 +22,6 @@ component: File system component: File system os: * hosts: * - families: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -40,7 +38,6 @@ component: File system component: File system os: * hosts: * - families: * calc: ($used + $reserved) * 100 / ($used + $free + $reserved) units: % every: 10s @@ -57,7 +54,6 @@ component: File system component: File system os: * hosts: * - families: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -74,7 +70,6 @@ component: File system component: File system os: * hosts: * - families: * units: errors lookup: max -10m 
every 1m of read_errs warn: $this > 0 @@ -89,7 +84,6 @@ component: File system component: File system os: * hosts: * - families: * units: errors lookup: max -10m every 1m of write_errs warn: $this > 0 @@ -104,7 +98,6 @@ component: File system component: File system os: * hosts: * - families: * units: errors lookup: max -10m every 1m of flush_errs warn: $this > 0 @@ -119,7 +112,6 @@ component: File system component: File system os: * hosts: * - families: * units: errors lookup: max -10m every 1m of corruption_errs warn: $this > 0 @@ -134,7 +126,6 @@ component: File system component: File system os: * hosts: * - families: * units: errors lookup: max -10m every 1m of generation_errs warn: $this > 0 diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf index 1f227841e..09e4f9d40 100644 --- a/health/health.d/cockroachdb.conf +++ b/health/health.d/cockroachdb.conf @@ -6,7 +6,7 @@ class: Utilization type: Database component: CockroachDB - calc: $capacity_used_percent + calc: $total units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (85)) @@ -20,7 +20,7 @@ component: CockroachDB class: Utilization type: Database component: CockroachDB - calc: $capacity_usable_used_percent + calc: $usable units: % every: 10s warn: $this > (($status >= $WARNING) ? 
(80) : (85)) @@ -36,7 +36,7 @@ component: CockroachDB class: Errors type: Database component: CockroachDB - calc: $ranges_unavailable + calc: $unavailable units: num every: 10s warn: $this > 0 @@ -49,7 +49,7 @@ component: CockroachDB class: Errors type: Database component: CockroachDB - calc: $ranges_underreplicated + calc: $under_replicated units: num every: 10s warn: $this > 0 @@ -64,7 +64,7 @@ component: CockroachDB class: Utilization type: Database component: CockroachDB - calc: $sys_fd_open/$sys_fd_softlimit * 100 + calc: $open/$sys_fd_softlimit * 100 units: % every: 10s warn: $this > 80 diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index fd207fbc1..7bd4f120c 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -16,7 +16,7 @@ component: Disk os: linux freebsd hosts: * - families: !/dev !/dev/* !/run !/run/* * +chart labels: mount_point=!/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % every: 1m @@ -33,7 +33,7 @@ component: Disk component: Disk os: linux freebsd hosts: * - families: !/dev !/dev/* !/run !/run/* * +chart labels: mount_point=!/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % every: 1m @@ -59,7 +59,6 @@ component: Disk # on: disk.space # os: linux freebsd # hosts: * -# families: * # lookup: min -10m at -50m unaligned of avail # calc: ($this - $avail) / (($now - $after) / 3600) # every: 1m @@ -75,7 +74,6 @@ component: Disk # on: disk.space # os: linux freebsd # hosts: * -# families: * # calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) # units: hours # every: 10s @@ -101,7 +99,6 @@ component: Disk # on: disk.inodes # os: linux freebsd # hosts: * -# families: * # lookup: min -10m at -50m unaligned of avail # calc: ($this - $avail) / (($now - $after) / 3600) # every: 1m @@ -116,7 +113,6 @@ component: Disk # on: disk.inodes # os: linux freebsd # hosts: * -# families: * # calc: ($disk_inode_rate > 0) ? 
($avail / $disk_inode_rate) : (inf) # units: hours # every: 10s @@ -141,7 +137,6 @@ component: Disk component: Disk os: linux freebsd hosts: * - families: * lookup: average -10m unaligned units: % every: 1m @@ -163,7 +158,6 @@ component: Disk component: Disk os: linux hosts: * - families: * lookup: average -10m unaligned units: ms every: 1m diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf index 06f398c6e..f1030a317 100644 --- a/health/health.d/exporting.conf +++ b/health/health.d/exporting.conf @@ -1,6 +1,5 @@ template: exporting_last_buffering - families: * on: exporting_data_size class: Latency type: Netdata @@ -15,7 +14,6 @@ component: Exporting engine to: dba template: exporting_metrics_sent - families: * on: exporting_data_size class: Workload type: Netdata diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index 2008b000d..81748b9e0 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -1,7 +1,6 @@ # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges template: httpcheck_web_service_up - families: * on: httpcheck.status class: Utilization type: Web Server @@ -14,7 +13,6 @@ component: HTTP endpoint to: silent template: httpcheck_web_service_bad_content - families: * on: httpcheck.status class: Workload type: Web Server @@ -29,7 +27,6 @@ component: HTTP endpoint to: webmaster template: httpcheck_web_service_bad_status - families: * on: httpcheck.status class: Workload type: Web Server @@ -44,7 +41,6 @@ component: HTTP endpoint to: webmaster template: httpcheck_web_service_timeouts - families: * on: httpcheck.status class: Latency type: Web Server @@ -59,7 +55,6 @@ component: HTTP endpoint to: webmaster template: httpcheck_web_service_no_connection - families: * on: httpcheck.status class: Errors type: Other diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 8b498ad3c..2786cbd62 100644 --- a/health/health.d/ioping.conf +++ 
b/health/health.d/ioping.conf @@ -1,5 +1,4 @@ template: ioping_disk_latency - families: * on: ioping.latency class: Latency type: System diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index ed980a26a..b90455a58 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -29,7 +29,7 @@ component: RAID class: Errors type: System component: RAID - families: !*(raid1) !*(raid10) * +chart labels: raid_level=!raid1 !raid10 * units: unsynchronized blocks calc: $count every: 60s diff --git a/health/health.d/net.conf b/health/health.d/net.conf index a0723f303..08a4eecb4 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -11,7 +11,6 @@ component: Network os: * hosts: * - families: * calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan ) units: Mbit every: 10s @@ -24,7 +23,6 @@ component: Network component: Network os: linux hosts: * - families: * lookup: average -1m unaligned absolute of received calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan ) units: % @@ -41,7 +39,6 @@ component: Network component: Network os: linux hosts: * - families: * lookup: average -1m unaligned absolute of sent calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan ) units: % @@ -68,7 +65,6 @@ component: Network component: Network os: linux hosts: * - families: * lookup: sum -10m unaligned absolute of inbound units: packets every: 1m @@ -81,7 +77,6 @@ component: Network component: Network os: linux hosts: * - families: * lookup: sum -10m unaligned absolute of outbound units: packets every: 1m @@ -94,7 +89,7 @@ component: Network component: Network os: linux hosts: * - families: !wl* * +chart labels: device=!wl* * lookup: sum -10m unaligned absolute of received calc: (($inbound_packets_dropped != nan AND $this > 10000) ? 
($inbound_packets_dropped * 100 / $this) : (0)) units: % @@ -111,7 +106,7 @@ component: Network component: Network os: linux hosts: * - families: !wl* * +chart labels: device=!wl* * lookup: sum -10m unaligned absolute of sent calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0)) units: % @@ -128,7 +123,7 @@ component: Network component: Network os: linux hosts: * - families: wl* +chart labels: device=wl* lookup: sum -10m unaligned absolute of received calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % @@ -145,7 +140,7 @@ component: Network component: Network os: linux hosts: * - families: wl* +chart labels: device=wl* lookup: sum -10m unaligned absolute of sent calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0)) units: % @@ -165,7 +160,6 @@ component: Network component: Network os: freebsd hosts: * - families: * lookup: sum -10m unaligned absolute of inbound units: errors every: 1m @@ -181,7 +175,6 @@ component: Network component: Network os: freebsd hosts: * - families: * lookup: sum -10m unaligned absolute of outbound units: errors every: 1m @@ -205,7 +198,6 @@ component: Network component: Network os: linux hosts: * - families: * lookup: sum -10m unaligned absolute units: errors every: 1m @@ -230,7 +222,6 @@ component: Network component: Network os: linux freebsd hosts: * - families: * lookup: average -1m unaligned of received units: packets every: 10s @@ -243,7 +234,6 @@ component: Network component: Network os: linux freebsd hosts: * - families: * lookup: average -10s unaligned of received calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) every: 10s diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf index b7c0e6fd4..742ffbc93 100644 --- a/health/health.d/nvme.conf +++ b/health/health.d/nvme.conf @@ -1,7 +1,6 @@ # you can 
disable an alarm notification by setting the 'to' line to: silent template: nvme_device_critical_warnings_state - families: * on: nvme.device_critical_warnings_state class: Errors type: System diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf index fa8213ad3..b8d39bbad 100644 --- a/health/health.d/ping.conf +++ b/health/health.d/ping.conf @@ -1,7 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent template: ping_host_reachable - families: * on: ping.host_packet_loss class: Errors type: Other @@ -16,7 +15,6 @@ component: Network to: sysadmin template: ping_packet_loss - families: * on: ping.host_packet_loss class: Errors type: Other @@ -33,7 +31,6 @@ component: Network to: sysadmin template: ping_host_latency - families: * on: ping.host_rtt class: Latency type: Other diff --git a/health/health.d/plugin.conf b/health/health.d/plugin.conf new file mode 100644 index 000000000..0a891db79 --- /dev/null +++ b/health/health.d/plugin.conf @@ -0,0 +1,11 @@ + template: plugin_availability_status + on: netdata.plugin_availability_status + class: Errors + type: Netdata + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? 
($update_every) : (20 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: the amount of time that ${label:_collect_plugin} did not report its availability status + to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index e8908404c..34550ea02 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -1,7 +1,6 @@ # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges template: portcheck_service_reachable - families: * on: portcheck.status class: Workload type: Other @@ -14,7 +13,6 @@ component: TCP endpoint to: silent template: portcheck_connection_timeouts - families: * on: portcheck.status class: Errors type: Other @@ -29,7 +27,6 @@ component: TCP endpoint to: sysadmin template: portcheck_connection_fails - families: * on: portcheck.status class: Errors type: Other diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index 34d00b5df..a58fa34d1 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -1,7 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent template: redis_connections_rejected - families: * on: redis.connections class: Errors type: KV Storage @@ -15,7 +14,6 @@ component: Redis to: dba template: redis_bgsave_broken - families: * on: redis.bgsave_health class: Errors type: KV Storage @@ -28,7 +26,6 @@ component: Redis to: dba template: redis_bgsave_slow - families: * on: redis.bgsave_now class: Latency type: KV Storage @@ -43,7 +40,6 @@ component: Redis to: dba template: redis_master_link_down - families: * on: redis.master_link_down_since_time class: Errors type: KV Storage diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf index d8fc899b9..1d8be6cb5 100644 --- a/health/health.d/vsphere.conf +++ b/health/health.d/vsphere.conf @@ -43,7 +43,6 @@ component: Memory type: Virtual Machine component: Network hosts: * - families: * lookup: sum -10m unaligned 
absolute match-names of rx units: packets every: 1m @@ -55,7 +54,6 @@ component: Network type: Virtual Machine component: Network hosts: * - families: * lookup: sum -10m unaligned absolute match-names of tx units: packets every: 1m @@ -69,7 +67,6 @@ component: Network type: Virtual Machine component: Network hosts: * - families: * lookup: sum -10m unaligned absolute match-names of rx calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0)) units: % @@ -85,7 +82,6 @@ component: Network type: Virtual Machine component: Network hosts: * - families: * lookup: sum -10m unaligned absolute match-names of tx calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0)) units: % @@ -121,7 +117,6 @@ component: CPU type: Virtual Machine component: Network hosts: * - families: * lookup: sum -10m unaligned absolute match-names of rx units: packets every: 1m @@ -133,7 +128,6 @@ component: Network type: Virtual Machine component: Network hosts: * - families: * lookup: sum -10m unaligned absolute match-names of tx units: packets every: 1m @@ -147,7 +141,6 @@ component: Network type: Virtual Machine component: Network hosts: * - families: * lookup: sum -10m unaligned absolute match-names of rx calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0)) units: % @@ -163,7 +156,6 @@ component: Network type: Virtual Machine component: Network hosts: * - families: * lookup: sum -10m unaligned absolute match-names of tx calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? 
($vsphere_outbound_packets_dropped * 100 / $this) : (0)) units: % diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index c33c4664c..3fd01831b 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -13,7 +13,6 @@ class: Workload type: Web Server component: Web log - families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) units: requests @@ -25,7 +24,6 @@ component: Web log class: Errors type: Web Server component: Web log - families: * lookup: sum -1m unaligned of unmatched calc: $this * 100 / $web_log_1m_total_requests units: % @@ -50,7 +48,6 @@ component: Web log class: Workload type: Web Server component: Web log - families: * lookup: sum -1m unaligned calc: ($this == 0)?(1):($this) units: requests @@ -62,7 +59,6 @@ component: Web log class: Workload type: Web Server component: Web log - families: * lookup: sum -1m unaligned of success calc: $this * 100 / $web_log_1m_requests units: % @@ -78,7 +74,6 @@ component: Web log class: Workload type: Web Server component: Web log - families: * lookup: sum -1m unaligned of redirect calc: $this * 100 / $web_log_1m_requests units: % @@ -93,7 +88,6 @@ component: Web log class: Errors type: Web Server component: Web log - families: * lookup: sum -1m unaligned of bad calc: $this * 100 / $web_log_1m_requests units: % @@ -108,7 +102,6 @@ component: Web log class: Errors type: Web Server component: Web log - families: * lookup: sum -1m unaligned of error calc: $this * 100 / $web_log_1m_requests units: % @@ -134,7 +127,6 @@ component: Web log class: Latency type: System component: Web log - families: * lookup: average -10m unaligned of avg units: ms every: 30s @@ -145,7 +137,6 @@ component: Web log class: Latency type: Web Server component: Web log - families: * lookup: average -1m unaligned of avg units: ms every: 10s @@ -174,7 +165,6 @@ component: Web log class: Workload type: Web Server component: Web log - families: * lookup: average -5m at -5m unaligned of success 
units: requests/s every: 30s @@ -185,7 +175,6 @@ component: Web log class: Workload type: Web Server component: Web log - families: * lookup: average -5m unaligned of success units: requests/s every: 30s @@ -196,7 +185,6 @@ component: Web log class: Workload type: Web Server component: Web log - families: * calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100) units: % every: 30s diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf index d678ac3ae..d4bc7639c 100644 --- a/health/health.d/windows.conf +++ b/health/health.d/windows.conf @@ -62,7 +62,6 @@ component: Memory component: Network os: linux hosts: * - families: * lookup: sum -10m unaligned absolute match-names of inbound units: packets every: 1m @@ -78,7 +77,6 @@ component: Network component: Network os: linux hosts: * - families: * lookup: sum -10m unaligned absolute match-names of outbound units: packets every: 1m @@ -94,7 +92,6 @@ component: Network component: Network os: linux hosts: * - families: * lookup: sum -10m unaligned absolute match-names of inbound units: packets every: 1m @@ -110,7 +107,6 @@ component: Network component: Network os: linux hosts: * - families: * lookup: sum -10m unaligned absolute match-names of outbound units: packets every: 1m diff --git a/health/health.h b/health/health.h index 902e36c62..c36aabac7 100644 --- a/health/health.h +++ b/health/health.h @@ -41,7 +41,6 @@ void health_reload(void); void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); -void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf); void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf); @@ -87,11 +86,10 @@ void 
health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); void *health_cmdapi_thread(void *ptr); -void health_label_log_save(RRDHOST *host); - char *health_edit_command_from_source(const char *source); void sql_refresh_hashes(void); void health_add_host_labels(void); +void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix); #endif //NETDATA_HEALTH_H diff --git a/health/health_config.c b/health/health_config.c index 38857fc9a..a11fd51cd 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -32,6 +32,7 @@ #define HEALTH_REPEAT_KEY "repeat" #define HEALTH_HOST_LABEL_KEY "host labels" #define HEALTH_FOREACH_KEY "foreach" +#define HEALTH_CHART_LABEL_KEY "chart labels" static inline int health_parse_delay( size_t line, const char *filename, char *string, @@ -192,6 +193,50 @@ static inline int isvariableterm(const char s) { return 1; } +// If needed, add a prefix key to all possible values in the range +static inline char *health_config_add_key_to_values(char *value) { + BUFFER *wb = buffer_create(HEALTH_CONF_MAX_LINE + 1, NULL); + char key[HEALTH_CONF_MAX_LINE + 1]; + char data[HEALTH_CONF_MAX_LINE + 1]; + + char *s = value; + size_t i = 0; + + key[0] = '\0'; + while(*s) { + if (*s == '=') { + //hold the key + data[i]='\0'; + strncpyz(key, data, HEALTH_CONF_MAX_LINE); + i=0; + } else if (*s == ' ') { + data[i]='\0'; + if (data[0]=='!') + buffer_snprintf(wb, HEALTH_CONF_MAX_LINE, "!%s=%s ", key, data + 1); + else + buffer_snprintf(wb, HEALTH_CONF_MAX_LINE, "%s=%s ", key, data); + i=0; + } else { + data[i++] = *s; + } + s++; + } + + data[i]='\0'; + if (data[0]) { + if (data[0]=='!') + buffer_snprintf(wb, HEALTH_CONF_MAX_LINE, "!%s=%s ", key, data + 1); + else + buffer_snprintf(wb, HEALTH_CONF_MAX_LINE, "%s=%s ", key, data); + } + + char *final = mallocz(HEALTH_CONF_MAX_LINE + 1); + strncpyz(final, buffer_tostring(wb), HEALTH_CONF_MAX_LINE); + buffer_free(wb); + + return final; +} + static inline 
void parse_variables_and_store_in_health_rrdvars(char *value, size_t len) { const char *s = value; char buffer[RRDVAR_MAX_LENGTH]; @@ -453,6 +498,7 @@ static inline void alert_config_free(struct alert_config *cfg) string_freez(cfg->host_labels); string_freez(cfg->p_db_lookup_dimensions); string_freez(cfg->p_db_lookup_method); + string_freez(cfg->chart_labels); freez(cfg); } @@ -489,7 +535,8 @@ static int health_readfile(const char *filename, void *data) { hash_delay = 0, hash_options = 0, hash_repeat = 0, - hash_host_label = 0; + hash_host_label = 0, + hash_chart_label = 0; char buffer[HEALTH_CONF_MAX_LINE + 1]; @@ -521,6 +568,7 @@ static int health_readfile(const char *filename, void *data) { hash_options = simple_uhash(HEALTH_OPTIONS_KEY); hash_repeat = simple_uhash(HEALTH_REPEAT_KEY); hash_host_label = simple_uhash(HEALTH_HOST_LABEL_KEY); + hash_chart_label = simple_uhash(HEALTH_CHART_LABEL_KEY); } FILE *fp = fopen(filename, "r"); @@ -937,6 +985,27 @@ static int health_readfile(const char *filename, void *data) { rc->module_match = string_strdupz(value); rc->module_pattern = simple_pattern_create(rrdcalc_module_match(rc), NULL, SIMPLE_PATTERN_EXACT, true); } + else if(hash == hash_chart_label && !strcasecmp(key, HEALTH_CHART_LABEL_KEY)) { + alert_cfg->chart_labels = string_strdupz(value); + if(rc->chart_labels) { + if(strcmp(rrdcalc_chart_labels(rc), value) != 0) + error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", + line, filename, rrdcalc_name(rc), key, value, value); + + string_freez(rc->chart_labels); + simple_pattern_free(rc->chart_labels_pattern); + } + + { + char *tmp = simple_pattern_trim_around_equal(value); + char *tmp_2 = health_config_add_key_to_values(tmp); + rc->chart_labels = string_strdupz(tmp_2); + freez(tmp); + freez(tmp_2); + } + rc->chart_labels_pattern = simple_pattern_create(rrdcalc_chart_labels(rc), NULL, SIMPLE_PATTERN_EXACT, + true); + } else { 
error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.", line, filename, rrdcalc_name(rc), key); @@ -1186,9 +1255,31 @@ static int health_readfile(const char *filename, void *data) { rt->host_labels = string_strdupz(tmp); freez(tmp); } + rt->host_labels_pattern = simple_pattern_create(rrdcalctemplate_host_labels(rt), NULL, SIMPLE_PATTERN_EXACT, true); } + else if(hash == hash_chart_label && !strcasecmp(key, HEALTH_CHART_LABEL_KEY)) { + alert_cfg->chart_labels = string_strdupz(value); + if(rt->chart_labels) { + if(strcmp(rrdcalctemplate_chart_labels(rt), value) != 0) + error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_chart_labels(rt), value, value); + + string_freez(rt->chart_labels); + simple_pattern_free(rt->chart_labels_pattern); + } + + { + char *tmp = simple_pattern_trim_around_equal(value); + char *tmp_2 = health_config_add_key_to_values(tmp); + rt->chart_labels = string_strdupz(tmp_2); + freez(tmp); + freez(tmp_2); + } + rt->chart_labels_pattern = simple_pattern_create(rrdcalctemplate_chart_labels(rt), NULL, + SIMPLE_PATTERN_EXACT, true); + } else { error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", line, filename, rrdcalctemplate_name(rt), key); diff --git a/health/health_json.c b/health/health_json.c index ba18bddba..4f81998f0 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -13,136 +13,6 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix); } -void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { - char *edit_command = ae->source ? 
health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); - char config_hash_id[GUID_LEN + 1]; - uuid_unparse_lower(ae->config_hash_id, config_hash_id); - - buffer_sprintf(wb, - "\n\t{\n" - "\t\t\"hostname\": \"%s\",\n" - "\t\t\"utc_offset\": %d,\n" - "\t\t\"timezone\": \"%s\",\n" - "\t\t\"unique_id\": %u,\n" - "\t\t\"alarm_id\": %u,\n" - "\t\t\"alarm_event_id\": %u,\n" - "\t\t\"config_hash_id\": \"%s\",\n" - "\t\t\"name\": \"%s\",\n" - "\t\t\"chart\": \"%s\",\n" - "\t\t\"context\": \"%s\",\n" - "\t\t\"family\": \"%s\",\n" - "\t\t\"class\": \"%s\",\n" - "\t\t\"component\": \"%s\",\n" - "\t\t\"type\": \"%s\",\n" - "\t\t\"processed\": %s,\n" - "\t\t\"updated\": %s,\n" - "\t\t\"exec_run\": %lu,\n" - "\t\t\"exec_failed\": %s,\n" - "\t\t\"exec\": \"%s\",\n" - "\t\t\"recipient\": \"%s\",\n" - "\t\t\"exec_code\": %d,\n" - "\t\t\"source\": \"%s\",\n" - "\t\t\"command\": \"%s\",\n" - "\t\t\"units\": \"%s\",\n" - "\t\t\"when\": %lu,\n" - "\t\t\"duration\": %lu,\n" - "\t\t\"non_clear_duration\": %lu,\n" - "\t\t\"status\": \"%s\",\n" - "\t\t\"old_status\": \"%s\",\n" - "\t\t\"delay\": %d,\n" - "\t\t\"delay_up_to_timestamp\": %lu,\n" - "\t\t\"updated_by_id\": %u,\n" - "\t\t\"updates_id\": %u,\n" - "\t\t\"value_string\": \"%s\",\n" - "\t\t\"old_value_string\": \"%s\",\n" - "\t\t\"last_repeat\": \"%lu\",\n" - "\t\t\"silenced\": \"%s\",\n" - , rrdhost_hostname(host) - , host->utc_offset - , rrdhost_abbrev_timezone(host) - , ae->unique_id - , ae->alarm_id - , ae->alarm_event_id - , config_hash_id - , ae_name(ae) - , ae_chart_name(ae) - , ae_chart_context(ae) - , ae_family(ae) - , ae->classification?ae_classification(ae):"Unknown" - , ae->component?ae_component(ae):"Unknown" - , ae->type?ae_type(ae):"Unknown" - , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false" - , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false" - , (unsigned long)ae->exec_run_timestamp - , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false" - , 
ae->exec?ae_exec(ae):string2str(host->health.health_default_exec) - , ae->recipient?ae_recipient(ae):string2str(host->health.health_default_recipient) - , ae->exec_code - , ae_source(ae) - , edit_command - , ae_units(ae) - , (unsigned long)ae->when - , (unsigned long)ae->duration - , (unsigned long)ae->non_clear_duration - , rrdcalc_status2string(ae->new_status) - , rrdcalc_status2string(ae->old_status) - , ae->delay - , (unsigned long)ae->delay_up_to_timestamp - , ae->updated_by_id - , ae->updates_id - , ae_new_value_string(ae) - , ae_old_value_string(ae) - , (unsigned long)ae->last_repeat - , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false" - ); - - health_string2json(wb, "\t\t", "info", ae->info ? ae_info(ae) : "", ",\n"); - - if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) { - buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n"); - } - - buffer_strcat(wb, "\t\t\"value\":"); - buffer_print_netdata_double(wb, ae->new_value); - buffer_strcat(wb, ",\n"); - - buffer_strcat(wb, "\t\t\"old_value\":"); - buffer_print_netdata_double(wb, ae->old_value); - buffer_strcat(wb, "\n"); - - buffer_strcat(wb, "\t}"); - - freez(edit_command); -} - -void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { - - buffer_strcat(wb, "["); - - unsigned int max = host->health_log.max; - unsigned int count = 0; - - STRING *chart_string = string_strdupz(chart); - - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - - ALARM_ENTRY *ae; - for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) { - if ((ae->unique_id > after) && (!chart || chart_string == ae->chart)) { - if (likely(count)) - buffer_strcat(wb, ","); - health_alarm_entry2json_nolock(wb, ae, host); - count++; - } - } - - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - - string_freez(chart_string); - - buffer_strcat(wb, "\n]\n"); -} - static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) { 
(void)host; buffer_sprintf(wb, @@ -397,43 +267,3 @@ void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) { buffer_strcat(wb, "\n\t}\n}\n"); } -static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, uint32_t mark) -{ - ALARM_ENTRY *ae = host->health_log.alarms; - - while(ae) { - if (ae->alarm_id == alarm_id && ae->unique_id > mark && - (ae->new_status != RRDCALC_STATUS_WARNING && ae->new_status != RRDCALC_STATUS_CRITICAL)) - return 1; - ae = ae->next; - } - return 0; -} - -void health_active_log_alarms_2json(RRDHOST *host, BUFFER *wb) { - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - - buffer_sprintf(wb, "[\n"); - - unsigned int max = host->health_log.max; - unsigned int count = 0; - ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && count < max ; ae = ae->next) { - if (!ae->updated_by_id && - ((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) || - ((ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) && - ae->new_status == RRDCALC_STATUS_REMOVED))) { - - if (have_recent_alarm(host, ae->alarm_id, ae->unique_id)) - continue; - - if (likely(count)) - buffer_strcat(wb, ","); - health_alarm_entry2json_nolock(wb, ae, host); - count++; - } - } - buffer_strcat(wb, "]"); - - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); -} diff --git a/health/health_log.c b/health/health_log.c index b1f59a1a5..b62e0ace4 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -5,14 +5,7 @@ // ---------------------------------------------------------------------------- inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { - sql_health_alarm_log_save(host, ae); - -#ifdef ENABLE_ACLK - if (netdata_cloud_setting) { - sql_queue_alarm_to_aclk(host, ae, 0); - } -#endif } // ---------------------------------------------------------------------------- @@ -53,6 +46,8 @@ inline ALARM_ENTRY* health_create_alarm_entry( uuid_copy(ae->config_hash_id, 
*((uuid_t *) config_hash_id)); + uuid_generate_random(ae->transition_id); + ae->family = string_dup(family); ae->classification = string_dup(class); ae->component = string_dup(component); |