summaryrefslogtreecommitdiffstats
path: root/src/health
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/health/README.md27
-rw-r--r--src/health/REFERENCE.md48
-rw-r--r--src/health/guides/apcupsd/apcupsd_last_collected_secs.md2
-rw-r--r--src/health/guides/boinc/boinc_total_tasks.md2
-rw-r--r--src/health/guides/boinc/boinc_upload_errors.md2
-rw-r--r--src/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md2
-rw-r--r--src/health/guides/dbengine/10min_dbengine_global_flushing_errors.md2
-rw-r--r--src/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md2
-rw-r--r--src/health/guides/dbengine/10min_dbengine_global_fs_errors.md2
-rw-r--r--src/health/guides/dbengine/10min_dbengine_global_io_errors.md2
-rw-r--r--src/health/guides/entropy/lowest_entropy.md4
-rw-r--r--src/health/guides/exporting/exporting_last_buffering.md2
-rw-r--r--src/health/guides/exporting/exporting_metrics_sent.md2
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_bad_content.md2
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_bad_status.md2
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_slow.md2
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_timeouts.md3
-rw-r--r--src/health/guides/httpcheck/httpcheck_web_service_unreachable.md2
-rw-r--r--src/health/guides/ipc/semaphores_used.md3
-rw-r--r--src/health/guides/load/load_average_15.md2
-rw-r--r--src/health/guides/load/load_average_5.md2
-rw-r--r--src/health/guides/mdstat/mdstat_nonredundant_last_collected.md2
-rw-r--r--src/health/guides/ml/ml_1min_node_ar.md14
-rw-r--r--src/health/guides/net/outbound_packets_dropped.md4
-rw-r--r--src/health/guides/portcheck/portcheck_connection_timeouts.md3
-rw-r--r--src/health/guides/postgres/postgres_table_cache_io_ratio.md3
-rw-r--r--src/health/guides/riakkv/riakkv_kv_get_slow.md5
-rw-r--r--src/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md2
-rw-r--r--src/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md2
-rw-r--r--src/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md3
-rw-r--r--src/health/guides/vernemq/vernemq_queue_message_expired.md1
-rw-r--r--src/health/guides/vsphere/vsphere_host_mem_usage.md5
-rw-r--r--src/health/guides/web_log/web_log_1m_total_requests.md5
-rw-r--r--src/health/guides/web_log/web_log_1m_unmatched.md12
-rw-r--r--src/health/guides/web_log/web_log_5m_successful.md4
-rw-r--r--src/health/guides/web_log/web_log_web_slow.md9
-rw-r--r--src/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md1
-rw-r--r--src/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md4
-rw-r--r--src/health/guides/windows/windows_10min_cpu_usage.md7
-rw-r--r--src/health/guides/windows/windows_disk_in_use.md3
-rw-r--r--src/health/guides/windows/windows_inbound_packets_discarded.md3
-rw-r--r--src/health/guides/windows/windows_inbound_packets_errors.md3
-rw-r--r--src/health/guides/windows/windows_outbound_packets_discarded.md3
-rw-r--r--src/health/guides/windows/windows_outbound_packets_errors.md7
-rw-r--r--src/health/guides/windows/windows_ram_in_use.md4
-rw-r--r--src/health/guides/windows/windows_swap_in_use.md5
-rw-r--r--src/health/guides/x509check/x509check_days_until_expiration.md3
-rw-r--r--src/health/guides/x509check/x509check_revocation_status.md8
-rw-r--r--src/health/health.d/adaptec_raid.conf33
-rw-r--r--src/health/health.d/bind_rndc.conf12
-rw-r--r--src/health/health.d/clickhouse.conf140
-rw-r--r--src/health/health.d/isc_dhcpd.conf25
-rw-r--r--src/health/health.d/load.conf2
-rw-r--r--src/health/health.d/lvm.conf31
-rw-r--r--src/health/health.d/megacli.conf59
-rw-r--r--src/health/health.d/ping.conf2
-rw-r--r--src/health/health.d/storcli.conf61
-rw-r--r--src/health/health.d/whoisquery.conf8
-rw-r--r--src/health/health.d/x509check.conf8
-rw-r--r--src/health/health.d/zfs.conf46
-rw-r--r--src/health/health.h2
-rw-r--r--src/health/health_config.c50
-rw-r--r--src/health/health_dyncfg.c325
-rw-r--r--src/health/health_event_loop.c19
-rw-r--r--src/health/health_internals.h3
-rw-r--r--src/health/health_log.c2
-rw-r--r--src/health/health_notifications.c4
-rw-r--r--src/health/health_prototypes.c56
-rw-r--r--src/health/health_prototypes.h2
-rw-r--r--src/health/notifications/README.md6
-rw-r--r--src/health/notifications/alerta/README.md4
-rw-r--r--src/health/notifications/awssns/README.md4
-rw-r--r--src/health/notifications/custom/README.md4
-rw-r--r--src/health/notifications/discord/README.md4
-rw-r--r--src/health/notifications/dynatrace/README.md4
-rw-r--r--src/health/notifications/email/README.md4
-rw-r--r--src/health/notifications/flock/README.md4
-rw-r--r--src/health/notifications/gotify/README.md4
-rw-r--r--src/health/notifications/irc/README.md4
-rw-r--r--src/health/notifications/kavenegar/README.md4
-rw-r--r--src/health/notifications/matrix/README.md14
-rw-r--r--src/health/notifications/matrix/metadata.yaml10
-rw-r--r--src/health/notifications/messagebird/README.md4
-rw-r--r--src/health/notifications/msteams/README.md4
-rw-r--r--src/health/notifications/ntfy/README.md4
-rw-r--r--src/health/notifications/opsgenie/README.md4
-rw-r--r--src/health/notifications/pagerduty/README.md4
-rw-r--r--src/health/notifications/prowl/README.md4
-rw-r--r--src/health/notifications/pushbullet/README.md4
-rw-r--r--src/health/notifications/pushover/README.md4
-rw-r--r--src/health/notifications/rocketchat/README.md4
-rw-r--r--src/health/notifications/slack/README.md4
-rw-r--r--src/health/notifications/smstools3/README.md4
-rw-r--r--src/health/notifications/syslog/README.md4
-rw-r--r--src/health/notifications/telegram/README.md4
-rw-r--r--src/health/notifications/twilio/README.md4
-rw-r--r--src/health/rrdcalc.c17
-rw-r--r--src/health/rrdcalc.h4
-rw-r--r--src/health/rrdvar.c2
-rw-r--r--src/health/schema.d/health%3Aalert%3Aprototype.json687
-rw-r--r--src/health/schema.d/health:alert:prototype.json675
101 files changed, 1555 insertions, 1088 deletions
diff --git a/src/health/README.md b/src/health/README.md
index 4b76de9a7..5c479af5f 100644
--- a/src/health/README.md
+++ b/src/health/README.md
@@ -1,14 +1,17 @@
# Alerts and notifications
-The Netdata Agent is a health watchdog for the health and performance of your systems, services, and applications. We've
-worked closely with our community of DevOps engineers, SREs, and developers to define hundreds of production-ready
-alerts that work without any configuration.
-
-The Agent's health monitoring system is also dynamic and fully customizable. You can write entirely new alerts, tune the
-community-configured alerts for every app/service [the Agent collects metrics from](https://github.com/netdata/netdata/blob/master/src/collectors/COLLECTORS.md), or
-silence anything you're not interested in. You can even power complex lookups by running statistical algorithms against
-your metrics.
-
-You can [use various alert notification methods](https://github.com/netdata/netdata/edit/master/docs/monitor/enable-notifications.md),
-[customize alerts](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md), and
-[disable/silence](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#disable-or-silence-alerts) alerts.
+Netdata offers two ways to receive alert notifications on external integrations. These methods work independently, which means you can enable both at the same time to send alert notifications to any number of endpoints.
+
+Both methods use a node's health alerts to generate the content of a notification.
+
+Read our documentation on [configuring alerts](/src/health/REFERENCE.md) to change the preconfigured thresholds or to create tailored alerts for your infrastructure.
+
+- Netdata Cloud provides centralized alert notifications, utilizing the health status data already sent to Netdata Cloud from connected nodes to send alerts to configured integrations. [Supported integrations](/docs/alerts-&-notifications/notifications/centralized-cloud-notifications) include Amazon SNS, Discord, Slack, Splunk, and others.
+
+- The Netdata Agent offers a [wider range of notification options](/docs/alerts-&-notifications/notifications/agent-dispatched-notifications) directly from the agent itself. You can choose from over a dozen services, including email, Slack, PagerDuty, Twilio, and others, for more granular control over notifications on each node.
+
+The Netdata Agent is a health watchdog for the health and performance of your systems, services, and applications. We've worked closely with our community of DevOps engineers, SREs, and developers to define hundreds of production-ready alerts that work without any configuration.
+
+The Agent's health monitoring system is also dynamic and fully customizable. You can write entirely new alerts, tune the pre-configured alerts for every app/service [the Agent collects metrics from](/src/collectors/COLLECTORS.md), or silence anything you're not interested in. You can even power complex lookups by running statistical algorithms against your metrics.
+
+You can [use various alert notification methods](/docs/alerts-and-notifications/notifications/README.md), [customize alerts](/src/health/REFERENCE.md), and [disable/silence](/src/health/REFERENCE.md#disable-or-silence-alerts) alerts.
diff --git a/src/health/REFERENCE.md b/src/health/REFERENCE.md
index 85f1d2281..8b0a9177e 100644
--- a/src/health/REFERENCE.md
+++ b/src/health/REFERENCE.md
@@ -4,13 +4,13 @@ Netdata's health watchdog is highly configurable, with support for dynamic thres
more. You can tweak any of the existing alerts based on your infrastructure's topology or specific monitoring needs, or
create new entities.
-You can use health alerts in conjunction with any of Netdata's [collectors](https://github.com/netdata/netdata/blob/master/src/collectors/README.md) (see
-the [supported collector list](https://github.com/netdata/netdata/blob/master/src/collectors/COLLECTORS.md)) to monitor the health of your systems, containers, and
+You can use health alerts in conjunction with any of Netdata's [collectors](/src/collectors/README.md) (see
+the [supported collector list](/src/collectors/COLLECTORS.md)) to monitor the health of your systems, containers, and
applications in real time.
While you can see active alerts both on the local dashboard and Netdata Cloud, all health alerts are configured _per
node_ via individual Netdata Agents. If you want to deploy a new alert across your
-[infrastructure](https://github.com/netdata/netdata/blob/master/docs/quickstart/infrastructure.md), you must configure each node with the same health configuration
+[infrastructure](/docs/netdata-cloud/organize-your-infrastructure-invite-your-team.md), you must configure each node with the same health configuration
files.
## Reload health configuration
@@ -33,14 +33,14 @@ You can configure the Agent's health watchdog service by editing files in two lo
- The `[health]` section in `netdata.conf`. By editing the daemon's behavior, you can disable health monitoring
altogether, run health checks more or less often, and more. See
- [daemon configuration](https://github.com/netdata/netdata/blob/master/src/daemon/config/README.md#health-section-options) for a table of
+ [daemon configuration](/src/daemon/config/README.md#health-section-options) for a table of
all the available settings, their default values, and what they control.
- The individual `.conf` files in `health.d/`. These health entity files are organized by the type of metric they are
performing calculations on or their associated collector. You should edit these files using the `edit-config`
script. For example: `sudo ./edit-config health.d/cpu.conf`.
-Navigate to your [Netdata config directory](https://github.com/netdata/netdata/blob/master/docs/configure/nodes.md) and
+Navigate to your [Netdata config directory](/docs/netdata-agent/configuration/README.md) and
use `edit-config` to make changes to any of these files.
### Edit individual alerts
@@ -84,7 +84,7 @@ Save the file and [reload Netdata's health configuration](#reload-health-configu
## Disable or silence alerts
Alerts and notifications can be disabled permanently via configuration changes, or temporarily, via the
-[health management API](https://github.com/netdata/netdata/blob/master/src/web/api/health/README.md). The
+[health management API](/src/web/api/health/README.md). The
available options are described below.
### Disable all alerts
@@ -114,7 +114,7 @@ This action requires that you [reload Netdata's health configuration](#reload-he
When you need to frequently disable all or some alerts from triggering during certain times (for instance
when running backups) you can use the
-[health management API](https://github.com/netdata/netdata/blob/master/src/web/api/health/README.md).
+[health management API](/src/web/api/health/README.md).
The API allows you to issue commands to control the health engine's behavior without changing configuration,
or restarting the agent.
@@ -122,7 +122,7 @@ or restarting the agent.
If you want health checks to keep running and alerts to keep getting triggered, but notifications to be
suppressed temporarily, you can use the
-[health management API](https://github.com/netdata/netdata/blob/master/src/web/api/health/README.md).
+[health management API](/src/web/api/health/README.md).
The API allows you to issue commands to control the health engine's behavior without changing configuration,
or restarting the agent.
@@ -134,7 +134,7 @@ your systems, containers, and applications work.
Read the [health entity reference](#health-entity-reference) for a full listing of the format,
syntax, and functionality of health entities.
-To write a new health entity into a new file, navigate to your [Netdata config directory](https://github.com/netdata/netdata/blob/master/docs/configure/nodes.md),
+To write a new health entity into a new file, navigate to your [Netdata config directory](/docs/netdata-agent/configuration/README.md),
then use `touch` to create a new file in the `health.d/` directory. Use `edit-config` to start editing the file.
As an example, let's create a `ram-usage.conf` file.
@@ -223,7 +223,7 @@ Netdata parses the following lines. Beneath the table is an in-depth explanation
- The `every` line is **required** if not using `lookup`.
- Each entity **must** have at least one of the following lines: `lookup`, `calc`, `warn`, or `crit`.
- A few lines use space-separated lists to define how the entity behaves. You can use `*` as a wildcard or prefix with
- `!` for a negative match. Order is important, too! See our [simple patterns docs](https://github.com/netdata/netdata/blob/master/src/libnetdata/simple_pattern/README.md) for
+ `!` for a negative match. Order is important, too! See our [simple patterns docs](/src/libnetdata/simple_pattern/README.md) for
more examples.
- Lines terminated by a `\` are spliced together with the next line. The backslash is removed and the following line is
joined with the current one. No space is inserted, so you may split a line anywhere, even in the middle of a word.
@@ -388,9 +388,9 @@ The format is:
lookup: METHOD(GROUPING OPTIONS) AFTER [at BEFORE] [every DURATION] [OPTIONS] [of DIMENSIONS]
```
-The full [database query API](https://github.com/netdata/netdata/blob/master/src/web/api/queries/README.md) is supported. In short:
+The full [database query API](/src/web/api/queries/README.md) is supported. In short:
-- `METHOD` is one of the available [grouping methods](https://github.com/netdata/netdata/blob/master/src/web/api/queries/README.md#grouping-methods) such as `average`, `min`, `max` etc.
+- `METHOD` is one of the available [grouping methods](/src/web/api/queries/README.md#grouping-methods) such as `average`, `min`, `max` etc.
This is required.
- `GROUPING OPTIONS` are optional and can have the form `CONDITION VALUE`, where `CONDITION` is `!=`, `=`, `<=`, `<`, `>`, `>=` and `VALUE` is a number. The `CONDITION` and `VALUE` are required for `countif`, while `VALUE` is used by `percentile`, `trimmed_mean` and `trimmed_median`.
@@ -416,7 +416,7 @@ The full [database query API](https://github.com/netdata/netdata/blob/master/src
- `average` after time-aggregation of each dimension, return the average of all dimensions.
- `sum` after time-aggregation of each dimension, return the sum of all dimensions (this is the default).
- `min2max` after time-aggregation of each dimension, return the delta between the min and the max of the dimensions.
- - `unligned` prevents shifting the query window to multiples of the query duration.
+ - `unaligned` prevents shifting the query window to multiples of the query duration.
- `match-ids` matches the dimensions based on their IDs (the default is enabled, give `match-names` to disable).
- `match-names` matches the dimension based on their names (the default is enabled, give `match-ids` to disable).
@@ -600,7 +600,7 @@ good idea to tell Netdata to not clear the notification, by using the `no-clear-
#### Alert line `host labels`
-Defines the list of labels present on a host. See our [host labels guide](https://github.com/netdata/netdata/blob/master/docs/guides/using-host-labels.md) for
+Defines the list of labels present on a host. See our [host labels guide](/docs/netdata-agent/configuration/organize-systems-metrics-and-alerts.md) for
an explanation of host labels and how to implement them.
For example, let's suppose that `netdata.conf` is configured with the following labels:
@@ -633,7 +633,7 @@ that will be applied to all hosts installed in the last decade with the followin
host labels: installed = 201*
```
-See our [simple patterns docs](https://github.com/netdata/netdata/blob/master/src/libnetdata/simple_pattern/README.md) for more examples.
+See our [simple patterns docs](/src/libnetdata/simple_pattern/README.md) for more examples.
#### Alert line `chart labels`
@@ -662,7 +662,7 @@ chart labels: mount_point=/mnt/disk1 device=sda
Will create the alert if the `mount_point` is `/mnt/disk1` and the `device` is `sda`. Furthermore, if a chart label name
is specified that does not exist in the chart, the chart won't be matched.
-See our [simple patterns docs](https://github.com/netdata/netdata/blob/master/src/libnetdata/simple_pattern/README.md) for more examples.
+See our [simple patterns docs](/src/libnetdata/simple_pattern/README.md) for more examples.
#### Alert line `summary`
@@ -808,14 +808,14 @@ You can find all the variables that can be used for a given chart, using
Agent dashboard. For example, [variables for the `system.cpu` chart of the
registry](https://registry.my-netdata.io/api/v1/alarm_variables?chart=system.cpu).
-> If you don't know how to find the CHART_NAME, you can read about it [here](https://github.com/netdata/netdata/blob/master/src/web/README.md#charts).
+> If you don't know how to find the CHART_NAME, you can read about it [here](/src/web/README.md#charts).
Netdata supports 3 internal indexes for variables that will be used in health monitoring.
<details><summary>The variables below can be used in both chart alerts and context templates.</summary>
Although the `alarm_variables` link shows you variables for a particular chart, the same variables can also be used in
-templates for charts belonging to a given [context](https://github.com/netdata/netdata/blob/master/src/web/README.md#contexts). The reason is that all charts of a given
+templates for charts belonging to a given [context](/src/web/README.md#contexts). The reason is that all charts of a given
context are essentially identical, with the only difference being the family that identifies a particular hardware or software instance.
</details>
@@ -1048,9 +1048,9 @@ lookup: mean -10s of user
Since [`z = (x - mean) / stddev`](https://en.wikipedia.org/wiki/Standard_score) we create two input alerts, one for `mean` and one for `stddev` and then use them both as inputs in our final `cpu_user_zscore` alert.
-### Example 6 - [Anomaly rate](https://github.com/netdata/netdata/blob/master/src/ml/README.md#anomaly-rate) based CPU chart alert
+### Example 6 - [Anomaly rate](/src/ml/README.md#anomaly-rate) based CPU chart alert
-Warning if 5 minute rolling [anomaly rate](https://github.com/netdata/netdata/blob/master/src/ml/README.md#anomaly-rate) averaged across all CPU dimensions is above 5%, critical if it goes above 20%:
+Warning if 5 minute rolling [anomaly rate](/src/ml/README.md#anomaly-rate) averaged across all CPU dimensions is above 5%, critical if it goes above 20%:
```yaml
template: ml_5min_cpu_chart
@@ -1067,9 +1067,9 @@ template: ml_5min_cpu_chart
The `lookup` line will calculate the average anomaly rate across all `system.cpu` dimensions over the last 5 minutes. In this case
Netdata will create one alert for the chart.
-### Example 7 - [Anomaly rate](https://github.com/netdata/netdata/blob/master/src/ml/README.md#anomaly-rate) based node level alert
+### Example 7 - [Anomaly rate](/src/ml/README.md#anomaly-rate) based node level alert
-Warning if 5 minute rolling [anomaly rate](https://github.com/netdata/netdata/blob/master/src/ml/README.md#anomaly-rate) averaged across all ML enabled dimensions is above 5%, critical if it goes above 20%:
+Warning if 5 minute rolling [anomaly rate](/src/ml/README.md#anomaly-rate) averaged across all ML enabled dimensions is above 5%, critical if it goes above 20%:
```yaml
template: ml_5min_node
@@ -1083,11 +1083,11 @@ template: ml_5min_node
info: rolling 5min anomaly rate for all ML enabled dims
```
-The `lookup` line will use the `anomaly_rate` dimension of the `anomaly_detection.anomaly_rate` ML chart to calculate the average [node level anomaly rate](https://github.com/netdata/netdata/blob/master/src/ml/README.md#node-anomaly-rate) over the last 5 minutes.
+The `lookup` line will use the `anomaly_rate` dimension of the `anomaly_detection.anomaly_rate` ML chart to calculate the average [node level anomaly rate](/src/ml/README.md#node-anomaly-rate) over the last 5 minutes.
## Troubleshooting
-You can compile Netdata with [debugging](https://github.com/netdata/netdata/blob/master/src/daemon/README.md#debugging) and then set in `netdata.conf`:
+You can compile Netdata with [debugging](/src/daemon/README.md#debugging) and then set in `netdata.conf`:
```yaml
[global]
diff --git a/src/health/guides/apcupsd/apcupsd_last_collected_secs.md b/src/health/guides/apcupsd/apcupsd_last_collected_secs.md
index 7c8f8035d..fb8d9f9fc 100644
--- a/src/health/guides/apcupsd/apcupsd_last_collected_secs.md
+++ b/src/health/guides/apcupsd/apcupsd_last_collected_secs.md
@@ -42,5 +42,5 @@ This alert is related to your American Power Conversion (APC) uninterruptible po
### Useful resources
-1. [Netdata - APC UPS monitoring](https://learn.netdata.cloud/docs/data-collection/ups/apc-ups)
+1. [Netdata - APC UPS monitoring](/src/collectors/charts.d.plugin/apcupsd/integrations/apc_ups.md)
2. [`apcupsd` - Power management and control software for APC UPS](https://github.com/apcupsd/apcupsd)
diff --git a/src/health/guides/boinc/boinc_total_tasks.md b/src/health/guides/boinc/boinc_total_tasks.md
index c14e15f85..ed7225784 100644
--- a/src/health/guides/boinc/boinc_total_tasks.md
+++ b/src/health/guides/boinc/boinc_total_tasks.md
@@ -24,8 +24,6 @@ This alert monitors the average number of total tasks for the BOINC system over
sudo /etc/init.d/boinc-client restart
```
-2. For other operating systems or custom installations, refer to the BOINC's documentation for restarting the client: https://boinc.berkeley.edu/wiki/Stop_or_restart_BOINC
-
#### Check system resources
BOINC tasks may fail or slow down if there are not enough system resources (CPU, RAM, or Disk Space) available. Monitor your system performance using tools like `top`, `free`, and `df`, and make adjustments if necessary to ensure that BOINC has enough resources to complete tasks.
diff --git a/src/health/guides/boinc/boinc_upload_errors.md b/src/health/guides/boinc/boinc_upload_errors.md
index 80c0ad364..ffd8c78b9 100644
--- a/src/health/guides/boinc/boinc_upload_errors.md
+++ b/src/health/guides/boinc/boinc_upload_errors.md
@@ -18,7 +18,7 @@ This alert indicates that your BOINC node is experiencing an increase in the ave
4. Inspect BOINC client logs
- Consult the BOINC client logs to gain insight into the upload errors. The logs can be found in the client's data directory. Refer to the [BOINC log file documentation](https://boinc.berkeley.edu/wiki/Log_Files) for more information on how to read and analyze the logs.
+ Consult the BOINC client logs to gain insight into the upload errors. The logs can be found in the client's data directory.
5. Contact project support
diff --git a/src/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md b/src/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md
index ef495cb72..4a48f1dba 100644
--- a/src/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md
+++ b/src/health/guides/cockroachdb/cockroachdb_unavailable_ranges.md
@@ -44,7 +44,7 @@ This alert indicates that there are unavailable ranges in your CockroachDB clust
6. Consider rebalancing the cluster
- Rebalancing the cluster can help distribute the load more evenly across nodes and reduce the number of unavailable ranges. See the [CockroachDB documentation](https://www.cockroachlabs.com/docs/stable/training/manual-rebalancing.html) for more information on manual rebalancing.
+ Rebalancing the cluster can help distribute the load more evenly across nodes and reduce the number of unavailable ranges. See the [CockroachDB documentation](https://www.cockroachlabs.com/docs/stable/demo-replication-and-rebalancing.html) for more information on manual rebalancing.
### Useful resources
diff --git a/src/health/guides/dbengine/10min_dbengine_global_flushing_errors.md b/src/health/guides/dbengine/10min_dbengine_global_flushing_errors.md
index 4e388eb28..7548c2d7e 100644
--- a/src/health/guides/dbengine/10min_dbengine_global_flushing_errors.md
+++ b/src/health/guides/dbengine/10min_dbengine_global_flushing_errors.md
@@ -9,5 +9,5 @@ faster disks. This alert is triggered in critical state when the number deleted
### Useful resources
-[Read more about Netdata DB engine](https://learn.netdata.cloud/docs/agent/database/engine)
+[Read more about Netdata DB engine](/src/database/engine/README.md)
diff --git a/src/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md b/src/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md
index 1029e7f60..444796703 100644
--- a/src/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md
+++ b/src/health/guides/dbengine/10min_dbengine_global_flushing_warnings.md
@@ -11,5 +11,5 @@ This alert is triggered in warn state when the number of `dbengine` dirty pages
### Useful resources
-[Read more about Netdata DB engine](https://learn.netdata.cloud/docs/agent/database/engine)
+[Read more about Netdata DB engine](/src/database/engine/README.md)
diff --git a/src/health/guides/dbengine/10min_dbengine_global_fs_errors.md b/src/health/guides/dbengine/10min_dbengine_global_fs_errors.md
index 446289a9c..a4093681b 100644
--- a/src/health/guides/dbengine/10min_dbengine_global_fs_errors.md
+++ b/src/health/guides/dbengine/10min_dbengine_global_fs_errors.md
@@ -10,5 +10,5 @@ This alert is triggered in warning state when the number of filesystem errors is
### Useful resources
-[Read more about Netdata DB engine](https://learn.netdata.cloud/docs/agent/database/engine)
+[Read more about Netdata DB engine](/src/database/engine/README.md)
diff --git a/src/health/guides/dbengine/10min_dbengine_global_io_errors.md b/src/health/guides/dbengine/10min_dbengine_global_io_errors.md
index c47004f40..6bb831669 100644
--- a/src/health/guides/dbengine/10min_dbengine_global_io_errors.md
+++ b/src/health/guides/dbengine/10min_dbengine_global_io_errors.md
@@ -10,5 +10,5 @@ This alert is triggered in critical state when the number of IO errors is greate
### Useful resources
-[Read more about Netdata DB engine](https://learn.netdata.cloud/docs/agent/database/engine)
+[Read more about Netdata DB engine](/src/database/engine/README.md)
diff --git a/src/health/guides/entropy/lowest_entropy.md b/src/health/guides/entropy/lowest_entropy.md
index b53aed2c6..c25dc4d01 100644
--- a/src/health/guides/entropy/lowest_entropy.md
+++ b/src/health/guides/entropy/lowest_entropy.md
@@ -4,7 +4,7 @@ This alert presents the minimum amount of entropy in the kernel entropy pool in
The Netdata Agent checks for the minimum entropy value in the last 5 minutes. The alert gets raised into warning if the value < 100, and cleared if the value > 200.
-For further information on how our alerts are calculated, please have a look at our [Documentation](https://learn.netdata.cloud/docs/agent/health/reference#expressions).
+For further information on how our alerts are calculated, please have a look at our [Documentation](/src/health/REFERENCE.md#expressions).
### What is entropy and why do we need it?
@@ -12,7 +12,7 @@ Entropy is similar to "randomness". A Linux system gathers "real" random numbers
Encryption and cryptography applications require random numbers to operate. A function or an algorithm that produces numbers -*that seem to be random*- is very predictable, if you know what function is used.
-In real life, we use our surroundings and our thoughts to produce truly random numbers. A computer can't really do this by itself, so it gathers numbers from a lot of sources. For example, it can get the CO2 levels in a room from a sensor on the system and use that as a random number.
+In real life, we use our surroundings and our thoughts to produce truly random numbers. A computer can't really do this by itself, so it gathers numbers from a lot of sources. For example, it can get the CO2 levels in a room from a sensor on the system and use that as a random number.
This way all the values are random and there is no pattern to be found among them.
diff --git a/src/health/guides/exporting/exporting_last_buffering.md b/src/health/guides/exporting/exporting_last_buffering.md
index 4b13fe761..1139b0b6d 100644
--- a/src/health/guides/exporting/exporting_last_buffering.md
+++ b/src/health/guides/exporting/exporting_last_buffering.md
@@ -26,4 +26,4 @@ This alert is related to the Netdata Exporting engine, which calculates the numb
### Useful resources
-1. [Netdata Exporting Engine](https://learn.netdata.cloud/docs/exporting-data-to-other-systems/exporting-reference)
+1. [Netdata Exporting Reference](/src/exporting/README.md)
diff --git a/src/health/guides/exporting/exporting_metrics_sent.md b/src/health/guides/exporting/exporting_metrics_sent.md
index f17f593c4..9896701ce 100644
--- a/src/health/guides/exporting/exporting_metrics_sent.md
+++ b/src/health/guides/exporting/exporting_metrics_sent.md
@@ -43,4 +43,4 @@ To troubleshoot this alert, follow these steps:
### Useful resources
-1. [Netdata Exporting Engine documentation](https://learn.netdata.cloud/docs/exporting-data-to-other-systems/exporting-reference)
+1. [Netdata Exporting Reference](/src/exporting/README.md)
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md b/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md
index 0a5961ca7..433425e09 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_bad_content.md
@@ -27,4 +27,4 @@ sudo ./edit-config go.d/httpcheck.conf
### Useful resources
-1. [HTTP endpoint monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck) \ No newline at end of file
+1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md) \ No newline at end of file
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md b/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md
index bd9c14341..60fabd751 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_bad_status.md
@@ -18,4 +18,4 @@ root@netdata # curl -v <your_http_endpoint>:<port>/<path>
### Useful resources
-1. [HTTP endpoint monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck)
+1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md)
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_slow.md b/src/health/guides/httpcheck/httpcheck_web_service_slow.md
index aad2cc8da..4f962e155 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_slow.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_slow.md
@@ -14,5 +14,5 @@ To troubleshoot this issue, check for:
### Useful resources
-1. [HTTP endpoint monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck)
+1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md)
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_timeouts.md b/src/health/guides/httpcheck/httpcheck_web_service_timeouts.md
index 03e300d1d..3e4c4debf 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_timeouts.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_timeouts.md
@@ -15,7 +15,7 @@ An HTTP request timeout occurs when a client (such as a web browser) sends a req
- Verify the issue
-Check the HTTP endpoint to see if it is responsive and reachable. You can use tools like `curl` or online services like [https://www.isitdownrightnow.com/](https://www.isitdownrightnow.com/) to check the availability of the website or service.
+Check the HTTP endpoint to see if it is responsive and reachable. You can use tools like `curl` or online services like <https://www.isitdownrightnow.com/> to check the availability of the website or service.
- Analyze server logs
@@ -36,4 +36,3 @@ Make sure your web server configurations are optimized for performance. For inst
- Verify network configurations
Examine the network configurations for potential issues that can lead to HTTP request timeouts. Check for misconfigured firewalls or faulty load balancers that may be interfering with traffic to the HTTP endpoint.
-
diff --git a/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md b/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md
index bb6f51bf5..c77d33c0b 100644
--- a/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md
+++ b/src/health/guides/httpcheck/httpcheck_web_service_unreachable.md
@@ -30,4 +30,4 @@ To troubleshoot this error, check the following:
### Useful resources
-1. [HTTP endpoint monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck) \ No newline at end of file
+1. [HTTP endpoint monitoring with Netdata](/src/go/collectors/go.d.plugin/modules/httpcheck/integrations/http_endpoints.md) \ No newline at end of file
diff --git a/src/health/guides/ipc/semaphores_used.md b/src/health/guides/ipc/semaphores_used.md
index 145ef0ad4..e58d1a60e 100644
--- a/src/health/guides/ipc/semaphores_used.md
+++ b/src/health/guides/ipc/semaphores_used.md
@@ -44,5 +44,4 @@ This alert monitors the percentage of allocated `System V IPC semaphores`. If yo
### Useful resources
1. [Interprocess Communication](https://docs.oracle.com/cd/E19455-01/806-4750/6jdqdfltn/index.html)
-2. [IPC: Semaphores](https://users.cs.cf.ac.uk/Dave.Marshall/C/node26.html)
-3. [Linux Kernel Documentation - IPC Semaphores](https://www.kernel.org/doc/Documentation/ipc/semaphore.txt) \ No newline at end of file
+2. [IPC: Semaphores](https://users.cs.cf.ac.uk/Dave.Marshall/C/node26.html) \ No newline at end of file
diff --git a/src/health/guides/load/load_average_15.md b/src/health/guides/load/load_average_15.md
index ba8b1e3e0..37df648a5 100644
--- a/src/health/guides/load/load_average_15.md
+++ b/src/health/guides/load/load_average_15.md
@@ -4,7 +4,7 @@ This alarm calculates the system `load average` (CPU and I/O demand) over the pe
The alert gets raised into warning if the metric is 2 times the expected value and cleared if the value is 1.75 times the expected value.
-For further information on how our alerts are calculated, please have a look at our [Documentation](https://learn.netdata.cloud/docs/agent/health/reference#expressions).
+For further information on how our alerts are calculated, please have a look at our [Documentation](/src/health/REFERENCE.md#expressions).
### What does "load average" mean?
diff --git a/src/health/guides/load/load_average_5.md b/src/health/guides/load/load_average_5.md
index 6eacfcec9..d284eb963 100644
--- a/src/health/guides/load/load_average_5.md
+++ b/src/health/guides/load/load_average_5.md
@@ -4,7 +4,7 @@ This alarm calculates the system `load average` (CPU and I/O demand) over the pe
The alert gets raised into warning if the metric is 4 times the expected value and cleared if the value is 3.5 times the expected value.
-For further information on how our alerts are calculated, please have a look at our [Documentation](https://learn.netdata.cloud/docs/agent/health/reference#expressions).
+For further information on how our alerts are calculated, please have a look at our [Documentation](/src/health/REFERENCE.md#expressions).
### What does "load average" mean?
diff --git a/src/health/guides/mdstat/mdstat_nonredundant_last_collected.md b/src/health/guides/mdstat/mdstat_nonredundant_last_collected.md
index f76c61483..2cc9574a1 100644
--- a/src/health/guides/mdstat/mdstat_nonredundant_last_collected.md
+++ b/src/health/guides/mdstat/mdstat_nonredundant_last_collected.md
@@ -52,4 +52,4 @@ The md (multiple device) driver is responsible for managing software RAID arrays
### Useful resources
1. [Linux RAID: A Quick Guide](https://www.cyberciti.biz/tips/linux-raid-increase-resync-rebuild-speed.html)
-2. [Netdata Agent Configuration Guide](https://learn.netdata.cloud/docs/agent/daemon/config)
+2. [Netdata Agent Configuration Guide](/src/daemon/config/README.md)
diff --git a/src/health/guides/ml/ml_1min_node_ar.md b/src/health/guides/ml/ml_1min_node_ar.md
index 203d991ac..b5f12389b 100644
--- a/src/health/guides/ml/ml_1min_node_ar.md
+++ b/src/health/guides/ml/ml_1min_node_ar.md
@@ -1,8 +1,8 @@
### Understand the alert
-This alert is triggered when the [node anomaly rate](https://learn.netdata.cloud/docs/ml-and-troubleshooting/machine-learning-ml-powered-anomaly-detection#node-anomaly-rate) exceeds the threshold defined in the [alert configuration](https://github.com/netdata/netdata/blob/master/src/health/health.d/ml.conf) over the most recent 1 minute window evaluated.
+This alert is triggered when the [node anomaly rate](/src/ml/README.md) exceeds the threshold defined in the [alert configuration](https://github.com/netdata/netdata/blob/master/src/health/health.d/ml.conf) over the most recent 1 minute window evaluated.
-For example, with the default of `warn: $this > 1`, this means that 1% or more of the metrics collected on the node have across the most recent 1 minute window been flagged as [anomalous](https://learn.netdata.cloud/docs/ml-and-troubleshooting/machine-learning-ml-powered-anomaly-detection) by Netdata.
+For example, with the default of `warn: $this > 1`, this means that 1% or more of the metrics collected on the node have been flagged as [anomalous](/src/ml/README.md) by Netdata across the most recent 1 minute window.
### Troubleshoot the alert
@@ -12,15 +12,15 @@ This alert is a signal that some significant percentage of metrics within your i
2. **Highlight the area of interest**: Highlight the timeframe of interest where you see an elevated anomaly rate.
-3. **Check the anomalies tab**: Check the [Anomaly Advisor](https://learn.netdata.cloud/docs/ml-and-troubleshooting/anomaly-advisor) ("Anomalies" tab) to see an ordered list of what metrics were most anomalous in the highlighted window.
+3. **Check the anomalies tab**: Check the [Anomaly Advisor tab](/docs/dashboards-and-charts/anomaly-advisor-tab.md) to see an ordered list of what metrics were most anomalous in the highlighted window.
4. **Press the AR% button on Overview**: You can also press the "[AR%](https://blog.netdata.cloud/anomaly-rates-in-the-menu/)" button on the Overview or single node dashboard to see what parts of the menu have the highest chart anomaly rates. Pressing the AR% button should add some "pills" to each menu item and if you hover over it you will see that chart within each menu section that was most anomalous during the highlighted timeframe.
-5. **Use Metric Correlations**: Use [metric correlations](https://learn.netdata.cloud/docs/ml-and-troubleshooting/metric-correlations) to see what metrics may have changed most significantly comparing before to the highlighted timeframe.
+5. **Use Metric Correlations**: Use [metric correlations](/docs/metric-correlations.md) to see which metrics may have changed most significantly when comparing the period before the highlighted timeframe to the highlighted timeframe itself.
### Useful resources
-1. [Machine learning (ML) powered anomaly detection](https://learn.netdata.cloud/docs/ml-and-troubleshooting/machine-learning-ml-powered-anomaly-detection)
-2. [Anomaly Advisor](https://learn.netdata.cloud/docs/ml-and-troubleshooting/anomaly-advisor)
-3. [Metric Correlations](https://learn.netdata.cloud/docs/ml-and-troubleshooting/metric-correlations)
+1. [Machine learning (ML) powered anomaly detection](/src/ml/README.md)
+2. [Anomaly Advisor tab](/docs/dashboards-and-charts/anomaly-advisor-tab.md)
+3. [Metric Correlations](/docs/metric-correlations.md)
4. [Anomaly Rates in the Menu!](https://blog.netdata.cloud/anomaly-rates-in-the-menu/)
diff --git a/src/health/guides/net/outbound_packets_dropped.md b/src/health/guides/net/outbound_packets_dropped.md
index 49291d1d9..f943c3fd8 100644
--- a/src/health/guides/net/outbound_packets_dropped.md
+++ b/src/health/guides/net/outbound_packets_dropped.md
@@ -14,7 +14,7 @@ Check the alert message for the `${label:device}` placeholder. It indicates the
2. Verify network congestion or excessive traffic:
-Excessive traffic or network congestion can lead to dropped packets. To check network traffic, use the `nload` tool. If it isn't installed, you can follow the instructions given [here](https://www.howtoforge.com/tutorial/install-nload-on-linux/).
+Excessive traffic or network congestion can lead to dropped packets. To check network traffic, use the `nload` tool.
```bash
nload ${label:device}
@@ -22,7 +22,7 @@ nload ${label:device}
This will display the current network bandwidth usage on the specified interface. Look for unusually high or fluctuating usage patterns, which could indicate congestion or excessive traffic.
-3. Verify hardware issues:
+1. Verify hardware issues:
Check the network interface and related hardware components (such as the network card, cables, and switches) for visible damage, loose connections, or other issues. Replace any defective components as needed.
diff --git a/src/health/guides/portcheck/portcheck_connection_timeouts.md b/src/health/guides/portcheck/portcheck_connection_timeouts.md
index 5386f1509..b3608f62e 100644
--- a/src/health/guides/portcheck/portcheck_connection_timeouts.md
+++ b/src/health/guides/portcheck/portcheck_connection_timeouts.md
@@ -37,5 +37,4 @@ This alert triggers a warning state when the ratio of timeouts is between 10-40%
### Useful resources
1. [Netstat Command in Linux](https://www.tecmint.com/20-netstat-commands-for-linux-network-management/)
-2. [Iostat Command Usage and Examples](https://www.thomas-krenn.com/en/wiki/Iostat_command_usage_and_examples)
-3. [Iftop Guide](https://www.tecmint.com/iftop-linux-network-bandwidth-monitoring-tool/)
+2. [Iftop Guide](https://www.tecmint.com/iftop-linux-network-bandwidth-monitoring-tool/)
diff --git a/src/health/guides/postgres/postgres_table_cache_io_ratio.md b/src/health/guides/postgres/postgres_table_cache_io_ratio.md
index 382f8ee4d..712f4aafc 100644
--- a/src/health/guides/postgres/postgres_table_cache_io_ratio.md
+++ b/src/health/guides/postgres/postgres_table_cache_io_ratio.md
@@ -28,5 +28,4 @@ Keep monitoring cache hit ratios after making changes to your configuration or o
### Useful resources
-1. [Tuning Your PostgreSQL Server](https://www.postgresql.org/docs/current/runtime-config-resource.html)
-2. [Performance Monitoring and Tuning in PostgreSQL](https://learn.netdata.cloud/docs/agent/collectors/python.d.plugin/postgres#monitoring)
+1. [Tuning Your PostgreSQL Server](https://www.postgresql.org/docs/current/runtime-config-resource.html) \ No newline at end of file
diff --git a/src/health/guides/riakkv/riakkv_kv_get_slow.md b/src/health/guides/riakkv/riakkv_kv_get_slow.md
index 05fd67ce7..888c96e72 100644
--- a/src/health/guides/riakkv/riakkv_kv_get_slow.md
+++ b/src/health/guides/riakkv/riakkv_kv_get_slow.md
@@ -17,6 +17,5 @@ The `riakkv_kv_get_slow` alert is related to Riak KV, a distributed NoSQL key-va
### Useful resources
1. [Riak KV documentation](https://riak.com/documentation/)
-2. [Monitoring Riak KV with Netdata](https://learn.netdata.cloud/docs/agent/collectors/python.d.plugin/riakkv/)
-3. [Riak Control: Monitoring and Administration Interface](https://docs.riak.com/riak/kv/2.2.3/configuring/reference/riak-vars/#riak-control)
-4. [Riak KV Monitoring and Metrics](https://docs.riak.com/riak/kv/2.2.3/using/performance/monitoring/index.html)
+2. [Riak Control: Monitoring and Administration Interface](https://docs.riak.com/riak/kv/2.2.3/configuring/reference/riak-vars/#riak-control)
+3. [Riak KV Monitoring and Metrics](https://docs.riak.com/riak/kv/2.2.3/using/performance/monitoring/index.html)
diff --git a/src/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md b/src/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md
index ab4932177..b6fb32d6c 100644
--- a/src/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md
+++ b/src/health/guides/vernemq/vernemq_mqtt_pubcomp_unexpected.md
@@ -25,5 +25,5 @@ In MQTT, the PUBCOMP packet is used when QoS (Quality of Service) 2 is applied.
### Useful resources
-1. [VerneMQ Documentation](https://vernemq.com/documentation/)
+1. [VerneMQ Documentation](https://docs.vernemq.com/)
2. [MQTT Specification - MQTT Control Packets](https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901046)
diff --git a/src/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md
index 9b1976494..2a7a0ca5b 100644
--- a/src/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md
+++ b/src/health/guides/vernemq/vernemq_mqtt_pubrec_sent_reason_unsuccessful.md
@@ -25,6 +25,6 @@ In the MQTT protocol, when a client sends a PUBLISH message with Quality of Serv
### Useful resources
-1. [VerneMQ Documentation](https://vernemq.com/documentation.html)
+1. [VerneMQ Documentation](https://docs.vernemq.com/)
2. [MQTT Essentials – All Core MQTT Concepts explained](https://www.hivemq.com/mqtt-essentials/)
3. [Understanding QoS Levels in MQTT](https://www.hivemq.com/blog/mqtt-essentials-part-6-mqtt-quality-of-service-levels/) \ No newline at end of file
diff --git a/src/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md b/src/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md
index 18e85e12a..85bc661a5 100644
--- a/src/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md
+++ b/src/health/guides/vernemq/vernemq_mqtt_pubrel_sent_reason_unsuccessful.md
@@ -45,5 +45,4 @@ This alert is related to VerneMQ, a high-performance MQTT broker. It monitors th
### Useful resources
1. [VerneMQ - Official Documentation](https://docs.vernemq.com/)
-2. [MQTT Essentials: Quality of Service 2 (QoS 2)](https://www.hivemq.com/blog/mqtt-essentials-part-6-mqtt-quality-of-service-levels/)
-3. [Netdata - VerneMQ monitoring](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/vernemq) \ No newline at end of file
+2. [MQTT Essentials: Quality of Service 2 (QoS 2)](https://www.hivemq.com/blog/mqtt-essentials-part-6-mqtt-quality-of-service-levels/) \ No newline at end of file
diff --git a/src/health/guides/vernemq/vernemq_queue_message_expired.md b/src/health/guides/vernemq/vernemq_queue_message_expired.md
index bd0533402..85a8688b8 100644
--- a/src/health/guides/vernemq/vernemq_queue_message_expired.md
+++ b/src/health/guides/vernemq/vernemq_queue_message_expired.md
@@ -50,4 +50,3 @@ Expired messages are removed from the queue and are not delivered to subscribers
### Useful resources
1. [VerneMQ Documentation](https://vernemq.com/docs/)
-2. [How to Monitor VerneMQ MQTT broker with Netdata](https://learn.netdata.cloud/guides/monitor/vernemq.html)
diff --git a/src/health/guides/vsphere/vsphere_host_mem_usage.md b/src/health/guides/vsphere/vsphere_host_mem_usage.md
index 458e403a2..991a76f85 100644
--- a/src/health/guides/vsphere/vsphere_host_mem_usage.md
+++ b/src/health/guides/vsphere/vsphere_host_mem_usage.md
@@ -28,6 +28,5 @@ The `vsphere_host_mem_usage` alert is triggered when the memory utilization of a
### Useful resources
-1. [Understanding Memory Utilization in VMware vSphere Host](https://www.altaro.com/vmware/memory-utilization-vmware-esxi/)
-2. [vSphere Monitoring and Performance Documentation](https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.monitoring.doc/GUID-115861E6-810A-43BB-8CDB-EE99CF8F3250.html)
-3. [Optimizing Memory Performance in VMware vSphere](https://blogs.vmware.com/performance/2021/04/optimizing-memory-performance-in-vmware-vsphere.html) \ No newline at end of file
+1. [vSphere Monitoring and Performance Documentation](https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.monitoring.doc/GUID-115861E6-810A-43BB-8CDB-EE99CF8F3250.html)
+2. [Optimizing Memory Performance in VMware vSphere](https://blogs.vmware.com/performance/2021/04/optimizing-memory-performance-in-vmware-vsphere.html) \ No newline at end of file
diff --git a/src/health/guides/web_log/web_log_1m_total_requests.md b/src/health/guides/web_log/web_log_1m_total_requests.md
index c867cfbf6..7dc19983d 100644
--- a/src/health/guides/web_log/web_log_1m_total_requests.md
+++ b/src/health/guides/web_log/web_log_1m_total_requests.md
@@ -30,7 +30,4 @@ An increase in workload means that your web server is handling more traffic than
### Useful resources
-1. [Analyzing Web server logs with ApacheTop](https://www.howtoforge.com/how-to-analyze-apache-web-server-logs-apachetop)
-2. [Logstash Guide: Analyzing Logs](https://www.elastic.co/guide/en/logstash/current/logstash-intro.html)
-3. [Web Application Performance Monitoring with New Relic](https://newrelic.com/platform/web-application-monitoring)
-4. [Vertically or Horizontally Scaling Your Web Server](https://www.digitalocean.com/community/tutorials/5-common-server-setups-for-your-web-application) \ No newline at end of file
+1. [Vertically or Horizontally Scaling Your Web Server](https://www.digitalocean.com/community/tutorials/5-common-server-setups-for-your-web-application) \ No newline at end of file
diff --git a/src/health/guides/web_log/web_log_1m_unmatched.md b/src/health/guides/web_log/web_log_1m_unmatched.md
index 2025c3fbb..b95fa28bf 100644
--- a/src/health/guides/web_log/web_log_1m_unmatched.md
+++ b/src/health/guides/web_log/web_log_1m_unmatched.md
@@ -12,14 +12,4 @@ Web servers like NGINX and Apache2 give you the ability to modify the log patter
You must create a new job in the `web_log` collector for your Agent.
-1. See how you can [configure this collector](https://learn.netdata.cloud/docs/agent/collectors/python.d.plugin/web_log#configuration)
-
-2. Follow the job template specified in the [default web_log.conf file](https://github.com/netdata/netdata/blob/master/src/collectors/python.d.plugin/web_log/web_log.conf#L53-L86), focus on the lines [83:85](https://github.com/netdata/netdata/blob/e6d9fbc4a53f1d35363e9b342231bb11627bafbd/collectors/python.d.plugin/web_log/web_log.conf#L83-L85) where you can see how you define a `custom_log_format`.
-
-3. Restart the Netdata Agent
- ```
- systemctl restart netdata
- ```
-
-
-
+1. See how you can [configure this collector](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/weblog#configuration)
diff --git a/src/health/guides/web_log/web_log_5m_successful.md b/src/health/guides/web_log/web_log_5m_successful.md
index 5c5b2c4e6..d3ca5916a 100644
--- a/src/health/guides/web_log/web_log_5m_successful.md
+++ b/src/health/guides/web_log/web_log_5m_successful.md
@@ -31,6 +31,4 @@ A successful HTTP request is one that receives a response with an HTTP status co
### Useful resources
1. [Apache Log Files](https://httpd.apache.org/docs/current/logs.html)
-2. [Nginx Log Files](https://nginx.org/en/docs/ngx_core_module.html#error_log)
-3. [Introduction to Identifying Security Vulnerabilities in Web Servers](https://www.acunetix.com/blog/articles/introduction-identifying-security-vulnerabilities-web-servers)
-4. [Web Application Performance Analysis and Monitoring](https://www.site24x7.com/learning/web-application-performance.html) \ No newline at end of file
+2. [Nginx Log Files](https://nginx.org/en/docs/ngx_core_module.html#error_log) \ No newline at end of file
diff --git a/src/health/guides/web_log/web_log_web_slow.md b/src/health/guides/web_log/web_log_web_slow.md
index 7ed3ebe1f..917d0325c 100644
--- a/src/health/guides/web_log/web_log_web_slow.md
+++ b/src/health/guides/web_log/web_log_web_slow.md
@@ -41,8 +41,7 @@ There are several factors that can cause slow web server performance. To trouble
### Useful resources
-1. [Optimizing NGINX for Performance](https://easyengine.io/tutorials/nginx/performance/)
-2. [Apache Performance Tuning](https://httpd.apache.org/docs/2.4/misc/perf-tuning.html)
-3. [Top 10 MySQL Performance Tuning Tips](https://www.databasejournal.com/features/mysql/top-10-mysql-performance-tuning-tips.html)
-4. [10 Tips for Optimal PostgreSQL Performance](https://www.digitalocean.com/community/tutorials/10-tips-for-optimizing-postgresql-performance-on-a-digitalocean-droplet)
-5. [A Beginner's Guide to HTTP Cache Headers](https://www.keycdn.com/blog/http-cache-headers) \ No newline at end of file
+1. [Apache Performance Tuning](https://httpd.apache.org/docs/2.4/misc/perf-tuning.html)
+2. [Top 10 MySQL Performance Tuning Tips](https://www.databasejournal.com/features/mysql/top-10-mysql-performance-tuning-tips.html)
+3. [10 Tips for Optimal PostgreSQL Performance](https://www.digitalocean.com/community/tutorials/10-tips-for-optimizing-postgresql-performance-on-a-digitalocean-droplet)
+4. [A Beginner's Guide to HTTP Cache Headers](https://www.keycdn.com/blog/http-cache-headers)
diff --git a/src/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md b/src/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md
index ce26c1e5e..0ba4a1fea 100644
--- a/src/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md
+++ b/src/health/guides/wifi/wifi_inbound_packets_dropped_ratio.md
@@ -47,6 +47,5 @@ sudo tcpdump -i <interface_name>
1. [Top 20 Netstat Command Examples in Linux](https://www.tecmint.com/20-netstat-commands-for-linux-network-management/)
2. [iftop command in Linux to monitor network traffic](https://www.tecmint.com/iftop-linux-network-bandwidth-monitoring-tool/)
-3. [An Overview of Packet Sniffing using Tcpdump](https://www.ubuntupit.com/tcpdump-useful-unix-packet-sniffer-command/)
Remember to replace `<interface_name>` with the actual name of the WiFi network interface causing the alert. \ No newline at end of file
diff --git a/src/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md b/src/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md
index 8441885df..5223fc073 100644
--- a/src/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md
+++ b/src/health/guides/wifi/wifi_outbound_packets_dropped_ratio.md
@@ -49,6 +49,4 @@ You can adjust network settings, like buffers or queues, to mitigate dropped pac
### Useful resources
1. [ifconfig command in Linux](https://www.geeksforgeeks.org/ifconfig-command-in-linux-with-examples/)
-2. [How to monitor network usage with iftop](https://www.binarytides.com/monitor-network-usage-with-iftop/)
-3. [nload – Monitor Network Traffic and Bandwidth Usage in Real Time](https://www.tecmint.com/nload-monitor-linux-network-traffic-bandwidth-usage/)
-4. [VNstat – A Network Traffic Monitor](https://www.tecmint.com/vnstat-monitor-network-traffic-in-linux/) \ No newline at end of file
+2. [nload – Monitor Network Traffic and Bandwidth Usage in Real Time](https://www.tecmint.com/nload-monitor-linux-network-traffic-bandwidth-usage/) \ No newline at end of file
diff --git a/src/health/guides/windows/windows_10min_cpu_usage.md b/src/health/guides/windows/windows_10min_cpu_usage.md
index 5b585c714..70edb16d9 100644
--- a/src/health/guides/windows/windows_10min_cpu_usage.md
+++ b/src/health/guides/windows/windows_10min_cpu_usage.md
@@ -30,7 +30,6 @@ This alert calculates the average total `CPU utilization` on a Windows system ov
### Useful resources
-1. [How to Monitor CPU Usage on Windows](https://www.tomsguide.com/how-to/how-to-monitor-cpu-usage-on-windows)
-2. [Windows Task Manager: A Troubleshooting Guide](https://www.howtogeek.com/66622/stupid-geek-tricks-6-ways-to-open-windows-task-manager/)
-3. [How to Use the Performance Monitor on Windows](https://www.digitalcitizen.life/how-use-performance-monitor-windows/)
-4. [Understanding Process Explorer](https://docs.microsoft.com/en-us/sysinternals/downloads/process-explorer) \ No newline at end of file
+1. [Windows Task Manager: A Troubleshooting Guide](https://www.howtogeek.com/66622/stupid-geek-tricks-6-ways-to-open-windows-task-manager/)
+2. [How to Use the Performance Monitor on Windows](https://www.digitalcitizen.life/how-use-performance-monitor-windows/)
+3. [Understanding Process Explorer](https://docs.microsoft.com/en-us/sysinternals/downloads/process-explorer) \ No newline at end of file
diff --git a/src/health/guides/windows/windows_disk_in_use.md b/src/health/guides/windows/windows_disk_in_use.md
index 4642b79ce..1830a2b2c 100644
--- a/src/health/guides/windows/windows_disk_in_use.md
+++ b/src/health/guides/windows/windows_disk_in_use.md
@@ -30,5 +30,4 @@ Disk space utilization is crucial for the stable and efficient operation of your
### Useful resources
1. [Windows 10 Tips & Tricks: Analyze Disk Space & Free Space - YouTube](https://www.youtube.com/watch?v=NolLC9tBP_Y)
-2. [5 Free Tools to Visualize Disk Space Usage on Windows](https://www.hongkiat.com/blog/visualize-hard-disk-usage-free-tools-for-windows/)
-3. [10 Ways to Free Up Hard Drive Space on Windows](https://www.howtogeek.com/125923/7-ways-to-free-up-hard-disk-space-on-windows/) \ No newline at end of file
+2. [10 Ways to Free Up Hard Drive Space on Windows](https://www.howtogeek.com/125923/7-ways-to-free-up-hard-disk-space-on-windows/) \ No newline at end of file
diff --git a/src/health/guides/windows/windows_inbound_packets_discarded.md b/src/health/guides/windows/windows_inbound_packets_discarded.md
index 829e34ffe..039cf9e55 100644
--- a/src/health/guides/windows/windows_inbound_packets_discarded.md
+++ b/src/health/guides/windows/windows_inbound_packets_discarded.md
@@ -35,5 +35,4 @@ Packet corruption can be caused by faulty hardware, software issues, or even cyb
### Useful resources
1. [Windows Performance Monitor](https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/perfmon)
-2. [Windows Event Viewer](https://docs.microsoft.com/en-us/windows/win32/eventlog/event-log-reference)
-3. [How to troubleshoot networking problems on the Windows platform](https://support.microsoft.com/en-us/help/10267) \ No newline at end of file
+2. [Windows Event Viewer](https://docs.microsoft.com/en-us/windows/win32/eventlog/event-log-reference) \ No newline at end of file
diff --git a/src/health/guides/windows/windows_inbound_packets_errors.md b/src/health/guides/windows/windows_inbound_packets_errors.md
index aee982d6a..be1a2869f 100644
--- a/src/health/guides/windows/windows_inbound_packets_errors.md
+++ b/src/health/guides/windows/windows_inbound_packets_errors.md
@@ -37,5 +37,4 @@ To troubleshoot this alert, you can perform the following steps:
### Useful resources
1. [How to use Network Monitor in Windows](https://docs.microsoft.com/en-us/windows/client-management/troubleshoot-tcpip-network-monitor)
-2. [Network Troubleshooting Guide for Windows](https://techcommunity.microsoft.com/t5/networking-blog/network-troubleshooting-guide-for-windows/ba-p/428114)
-3. [How to Troubleshoot Network Connections with Ping and Tracert](https://www.windowscentral.com/how-troubleshoot-network-connection-ping-and-traceroute) \ No newline at end of file
+2. [Network Troubleshooting Guide for Windows](https://techcommunity.microsoft.com/t5/networking-blog/network-troubleshooting-guide-for-windows/ba-p/428114) \ No newline at end of file
diff --git a/src/health/guides/windows/windows_outbound_packets_discarded.md b/src/health/guides/windows/windows_outbound_packets_discarded.md
index 226c3b0ba..1cd5f922f 100644
--- a/src/health/guides/windows/windows_outbound_packets_discarded.md
+++ b/src/health/guides/windows/windows_outbound_packets_discarded.md
@@ -44,5 +44,4 @@ If your network is congested, it can cause an increase in discarded packets. Con
### Useful resources
1. [Using Performance Monitor to monitor network performance](https://techcommunity.microsoft.com/t5/ask-the-performance-team/using-perfmon-to-monitor-your-servers-network-performance/ba-p/373944)
-2. [Monitoring Network Performance with Resource Monitor](https://www.online-tech-tips.com/computer-tips/monitoring-network-performance-with-resource-monitor/)
-3. [Event Viewer in Windows](https://www.dummies.com/computers/operating-systems/windows-10/how-to-use-event-viewer-in-windows-10/) \ No newline at end of file
+2. [Event Viewer in Windows](https://www.dummies.com/computers/operating-systems/windows-10/how-to-use-event-viewer-in-windows-10/) \ No newline at end of file
diff --git a/src/health/guides/windows/windows_outbound_packets_errors.md b/src/health/guides/windows/windows_outbound_packets_errors.md
index 2ccb8ef16..7fcfb65f2 100644
--- a/src/health/guides/windows/windows_outbound_packets_errors.md
+++ b/src/health/guides/windows/windows_outbound_packets_errors.md
@@ -40,7 +40,6 @@ Ensure that your network interface card (NIC) drivers and firmware are up-to-dat
### Useful resources
-1. [Netstat Command Usage on Windows](https://www.computerhope.com/issues/ch001/stat.htm)
-2. [Wireshark - A Network Protocol Analyzer](https://www.wireshark.org/)
-3. [Tcpdump - A Packet Analyzer](https://www.tcpdump.org/)
-4. [Network Performance Monitoring and Diagnostics Guide](https://docs.microsoft.com/en-us/windows-server/networking/technologies/npmd/npmd) \ No newline at end of file
+1. [Wireshark - A Network Protocol Analyzer](https://www.wireshark.org/)
+2. [Tcpdump - A Packet Analyzer](https://www.tcpdump.org/)
+3. [Network Performance Monitoring and Diagnostics Guide](https://docs.microsoft.com/en-us/windows-server/networking/technologies/npmd/npmd) \ No newline at end of file
diff --git a/src/health/guides/windows/windows_ram_in_use.md b/src/health/guides/windows/windows_ram_in_use.md
index ef85588b0..f51a92eda 100644
--- a/src/health/guides/windows/windows_ram_in_use.md
+++ b/src/health/guides/windows/windows_ram_in_use.md
@@ -33,6 +33,4 @@ Memory utilization refers to the percentage of a system's RAM that is currently
### Useful resources
-1. [How to use Task Manager to monitor Windows PC's performance](https://support.microsoft.com/en-us/windows/how-to-use-task-manager-to-monitor-windows-pc-s-performance-171100cb-5e7d-aaba-29abfedfb06f)
-2. [How to use Performance Monitor on Windows 10](https://www.windowscentral.com/how-use-performance-monitor-windows-10)
-3. [How to fix high memory usage in Windows](https://pureinfotech.com/reduce-ram-memory-usage-windows/) \ No newline at end of file
+1. [How to use Performance Monitor on Windows 10](https://www.windowscentral.com/how-use-performance-monitor-windows-10) \ No newline at end of file
diff --git a/src/health/guides/windows/windows_swap_in_use.md b/src/health/guides/windows/windows_swap_in_use.md
index 5a6500915..38fa06548 100644
--- a/src/health/guides/windows/windows_swap_in_use.md
+++ b/src/health/guides/windows/windows_swap_in_use.md
@@ -36,6 +36,5 @@ Swap memory is a virtual memory management technique where a portion of the disk
### Useful resources
-1. [How to Manage Virtual Memory (Pagefile) in Windows 10](https://www.techbout.com/manage-virtual-memory-pagefile-windows-10-29638/)
-2. [Troubleshooting Windows Performance Issues Using the Resource Monitor](https://docs.microsoft.com/en-us/archive/blogs/askcore/troubleshooting-windows-performance-issues-using-the-resource-monitor)
-3. [Windows Performance Monitor](https://docs.microsoft.com/en-us/windows-server/administration/windows-server-2008-help/troubleshoot/windows-rel-performance-monitor) \ No newline at end of file
+1. [Troubleshooting Windows Performance Issues Using the Resource Monitor](https://docs.microsoft.com/en-us/archive/blogs/askcore/troubleshooting-windows-performance-issues-using-the-resource-monitor)
+2. [Windows Performance Monitor](https://docs.microsoft.com/en-us/windows-server/administration/windows-server-2008-help/troubleshoot/windows-rel-performance-monitor) \ No newline at end of file
diff --git a/src/health/guides/x509check/x509check_days_until_expiration.md b/src/health/guides/x509check/x509check_days_until_expiration.md
index a37792ab0..18314de54 100644
--- a/src/health/guides/x509check/x509check_days_until_expiration.md
+++ b/src/health/guides/x509check/x509check_days_until_expiration.md
@@ -41,5 +41,4 @@ If there are still issues or the alert persists, double-check your certificate m
### Useful resources
1. [Sectigo: What is an X.509 certificate?](https://sectigo.com/resource-library/what-is-x509-certificate)
-2. [Netdata: x509 certificate monitoring](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/x509check)
-3. [OpenSSL: X.509 Certificate Commands](https://www.openssl.org/docs/man1.1.1/man1/x509.html) \ No newline at end of file
+2. [OpenSSL: X.509 Certificate Commands](https://www.openssl.org/docs/man1.1.1/man1/x509.html) \ No newline at end of file
diff --git a/src/health/guides/x509check/x509check_revocation_status.md b/src/health/guides/x509check/x509check_revocation_status.md
index 2d14f1062..fc48deefe 100644
--- a/src/health/guides/x509check/x509check_revocation_status.md
+++ b/src/health/guides/x509check/x509check_revocation_status.md
@@ -28,8 +28,6 @@ This alert indicates that the X.509 certificate has been revoked, meaning that i
### Useful resources
-1. [X.509 Certificate Monitoring with Netdata](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/x509check)
-2. [How to use OpenSSL to verify a certificate against a CRL](https://raymii.org/s/tutorials/OpenSSL_command_line_Root_and_Intermediate_CA_including_OCSP_CRL_Signed_Certs.html)
-3. [SSL Shopper's SSL Checker](https://www.sslshopper.com/ssl-checker.html)
-4. [Renewing certificates with Certbot](https://certbot.eff.org/docs/using.html#renewing-certificates)
-5. [Creating a Self-Signed SSL Certificate](https://www.akadia.com/services/ssh_test_certificate.html) \ No newline at end of file
+1. [SSL Shopper's SSL Checker](https://www.sslshopper.com/ssl-checker.html)
+2. [Renewing certificates with Certbot](https://certbot.eff.org/docs/using.html#renewing-certificates)
+3. [Creating a Self-Signed SSL Certificate](https://www.akadia.com/services/ssh_test_certificate.html) \ No newline at end of file
diff --git a/src/health/health.d/adaptec_raid.conf b/src/health/health.d/adaptec_raid.conf
index b6f265db3..b01113b69 100644
--- a/src/health/health.d/adaptec_raid.conf
+++ b/src/health/health.d/adaptec_raid.conf
@@ -1,32 +1,29 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
-# logical device status check
-
- template: adaptec_raid_ld_status
- on: adaptec_raid.ld_status
+ template: adaptec_raid_ld_health_status
+ on: adaptecraid.logical_device_status
class: Errors
type: System
component: RAID
- lookup: max -10s
- units: bool
+ lookup: average -1m unaligned percentage of ok
+ units: %
every: 10s
- crit: $this > 0
+ crit: $this < 100
delay: down 5m multiplier 1.5 max 1h
- summary: Adaptec raid logical device status
- info: Logical device status is failed or degraded
+ summary: Adaptec RAID LD (number ${label:ld_number}) health status
+ info: Adaptec RAID logical device (number ${label:ld_number} name ${label:ld_name}) health status is critical
to: sysadmin
-# physical device state check
-
- template: adaptec_raid_pd_state
- on: adaptec_raid.pd_state
+ template: adaptec_raid_pd_health_state
+ on: adaptecraid.physical_device_state
class: Errors
type: System
component: RAID
- lookup: max -10s
- units: bool
+ lookup: average -1m unaligned percentage of ok
+ units: %
every: 10s
- crit: $this > 0
+ crit: $this < 100
delay: down 5m multiplier 1.5 max 1h
- summary: Adaptec raid physical device state
- info: Physical device state is not online
+ summary: Adaptec RAID PD (number ${label:pd_number}) health state
+ info: Adaptec RAID physical device (number ${label:pd_number} location ${label:location}) health state is critical
to: sysadmin
diff --git a/src/health/health.d/bind_rndc.conf b/src/health/health.d/bind_rndc.conf
deleted file mode 100644
index b1c271df9..000000000
--- a/src/health/health.d/bind_rndc.conf
+++ /dev/null
@@ -1,12 +0,0 @@
- template: bind_rndc_stats_file_size
- on: bind_rndc.stats_size
- class: Utilization
- type: DNS
-component: BIND
- units: megabytes
- every: 60
- calc: $stats_size
- warn: $this > 512
- summary: BIND statistics file size
- info: BIND statistics-file size
- to: sysadmin
diff --git a/src/health/health.d/clickhouse.conf b/src/health/health.d/clickhouse.conf
new file mode 100644
index 000000000..e24f71830
--- /dev/null
+++ b/src/health/health.d/clickhouse.conf
@@ -0,0 +1,140 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: clickhouse_restarted
+ on: clickhouse.uptime
+ class: Errors
+ type: Database
+component: ClickHouse
+ calc: $uptime
+ units: seconds
+ every: 10s
+ warn: $this > 1 AND $this < 180
+ summary: ClickHouse restart detected
+ info: ClickHouse has recently been restarted
+ to: silent
+
+ template: clickhouse_queries_preempted
+ on: clickhouse.queries_preempted
+ class: Workload
+ type: Database
+component: ClickHouse
+ lookup: max -1m unaligned
+ units: preempted_queries
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse preempted queries detected
+ info: ClickHouse has queries that are stopped and waiting due to priority setting
+ to: dba
+
+ template: clickhouse_long_running_query
+ on: clickhouse.longest_running_query_time
+ class: Latency
+ type: Database
+component: ClickHouse
+ lookup: max -1m unaligned
+ units: seconds
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (300) : (600))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse long-running query detected
+ info: ClickHouse has a long-running query exceeding the threshold
+ to: dba
+
+ template: clickhouse_rejected_inserts
+ on: clickhouse.rejected_inserts
+ class: Workload
+ type: Database
+component: ClickHouse
+ lookup: sum -1m unaligned
+ units: rejected_inserts
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse rejected INSERT queries detected
+ info: ClickHouse has INSERT queries that are rejected due to high number of active data parts for partition in a MergeTree
+ to: dba
+
+ template: clickhouse_delayed_inserts
+ on: clickhouse.delayed_inserts
+ class: Workload
+ type: Database
+component: ClickHouse
+ lookup: sum -1m unaligned
+ units: delayed_inserts
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse delayed INSERT queries detected
+ info: ClickHouse has INSERT queries that are throttled due to high number of active data parts for partition in a MergeTree
+ to: silent
+
+ template: clickhouse_replication_lag
+ on: clickhouse.replicas_max_absolute_delay
+ class: Workload
+ type: Database
+component: ClickHouse
+ lookup: avg -1m unaligned
+ units: seconds
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (250) : (300))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse high replication lag detected
+ info: ClickHouse is experiencing replication lag greater than 5 minutes
+ to: dba
+
+ template: clickhouse_replicated_readonly_tables
+ on: clickhouse.replicated_readonly_tables
+ class: Errors
+ type: Database
+component: ClickHouse
+ lookup: max -1m unaligned
+ units: readonly_tables
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse replicated tables in readonly state detected
+ info: ClickHouse has replicated tables in readonly state due to ZooKeeper session loss/startup without ZooKeeper configured
+ to: dba
+
+ template: clickhouse_max_part_count_for_partition
+ on: clickhouse.max_part_count_for_partition
+ class: Workload
+ type: Database
+component: ClickHouse
+ lookup: avg -1m unaligned
+ units: parts
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (200) : (300))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse high parts/partition detected
+ info: ClickHouse high number of parts per partition
+ to: dba
+
+ template: clickhouse_distributed_connections_failures
+ on: clickhouse.distributed_connections_fail_exhausted_retries
+ class: Errors
+ type: Database
+component: ClickHouse
+ lookup: sum -1m unaligned
+ units: failures
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse distributed connections failures detected
+ info: ClickHouse has failed distributed connections after exhausting all retry attempts
+ to: dba
+
+ template: clickhouse_distributed_files_to_insert
+ on: clickhouse.distributed_files_to_insert
+ class: Workload
+ type: Database
+component: ClickHouse
+ lookup: max -1m unaligned
+ units: files
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (40) : (80))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: ClickHouse high files to insert detected
+ info: ClickHouse high number of pending files to process for asynchronous insertion into Distributed tables
+ to: silent
diff --git a/src/health/health.d/isc_dhcpd.conf b/src/health/health.d/isc_dhcpd.conf
index d1f93969a..3f6e9d5d4 100644
--- a/src/health/health.d/isc_dhcpd.conf
+++ b/src/health/health.d/isc_dhcpd.conf
@@ -1,10 +1,15 @@
-# template: isc_dhcpd_leases_size
-# on: isc_dhcpd.leases_total
-# units: KB
-# every: 60
-# calc: $leases_size
-# warn: $this > 3072
-# crit: $this > 6144
-# delay: up 2m down 5m
-# info: dhcpd.leases file too big! Module can slow down your server.
-# to: sysadmin
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: isc_dhcpd_dhcp_pool_utilization
+ on: isc_dhcpd.dhcp_pool_utilization
+ class: Utilization
+ type: DHCP
+component: DHCPd
+ every: 10s
+ units: %
+ calc: $used
+ warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
+ delay: down 5m
+ summary: ISC DHCP pool ${label:dhcp_pool_name} utilization
+ info: ISC DHCP pool ${label:dhcp_pool_name} utilization
+ to: sysadmin
diff --git a/src/health/health.d/load.conf b/src/health/health.d/load.conf
index 7b0e18b84..e639c9ad5 100644
--- a/src/health/health.d/load.conf
+++ b/src/health/health.d/load.conf
@@ -30,7 +30,7 @@ host labels: _os=linux
every: 1m
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
delay: down 15m multiplier 1.5 max 1h
- summary: Host load average (15 minutes)
+ summary: System load average (15 minutes)
info: System load average for the past 15 minutes
to: silent
diff --git a/src/health/health.d/lvm.conf b/src/health/health.d/lvm.conf
new file mode 100644
index 000000000..570aa14d3
--- /dev/null
+++ b/src/health/health.d/lvm.conf
@@ -0,0 +1,31 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: lvm_lv_data_space_utilization
+ on: lvm.lv_data_space_utilization
+ class: Utilization
+ type: System
+ component: LVM
+ calc: $utilization
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (85) : (90))
+ crit: ($this > (($status == $CRITICAL) ? (90) : (98)))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: LVM LV ${label:lv_name} on VG ${label:vg_name} high data space usage
+ info: LVM logical volume high data space usage (LV ${label:lv_name} VG ${label:vg_name} Type ${label:volume_type})
+ to: sysadmin
+
+ template: lvm_lv_metadata_space_utilization
+ on: lvm.lv_metadata_space_utilization
+ class: Utilization
+ type: System
+ component: LVM
+ calc: $utilization
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (85) : (90))
+ crit: ($this > (($status == $CRITICAL) ? (90) : (98)))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: LVM LV ${label:lv_name} on VG ${label:vg_name} high metadata space usage
+ info: LVM logical volume high metadata space usage (LV ${label:lv_name} VG ${label:vg_name} Type ${label:volume_type})
+ to: sysadmin
diff --git a/src/health/health.d/megacli.conf b/src/health/health.d/megacli.conf
index d1e2e7acf..27721fa9a 100644
--- a/src/health/health.d/megacli.conf
+++ b/src/health/health.d/megacli.conf
@@ -1,54 +1,55 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
-## Adapters (controllers)
+# Adapters (controllers)
- template: megacli_adapter_state
- on: megacli.adapter_degraded
+ template: megacli_adapter_health_state
+ on: megacli.adapter_health_state
class: Errors
type: System
component: RAID
- lookup: max -10s
- units: boolean
+ lookup: average -1m unaligned percentage of optimal
+ units: %
every: 10s
- crit: $this > 0
+ crit: $this < 100
delay: down 5m multiplier 2 max 10m
- summary: MegaCLI adapter state
- info: Adapter is in the degraded state (0: false, 1: true)
+ summary: MegaCLI adapter ${label:adapter_number} health
+ info: MegaCLI adapter ${label:adapter_number} is in the degraded state
to: sysadmin
-## Physical Disks
-
- template: megacli_pd_predictive_failures
- on: megacli.pd_predictive_failure
+ template: megacli_phys_drive_media_errors
+ on: megacli.phys_drive_media_errors
class: Errors
type: System
component: RAID
lookup: sum -10s
- units: predictive failures
+ units: media errors
every: 10s
warn: $this > 0
delay: up 1m down 5m multiplier 2 max 10m
- summary: MegaCLI physical drive predictive failures
- info: Number of physical drive predictive failures
+ summary: MegaCLI PD adapter ${label:adapter_number} slot ${label:slot_number} media errors
+ info: MegaCLI physical drive (adapter ${label:adapter_number} slot ${label:slot_number}) media errors
to: sysadmin
- template: megacli_pd_media_errors
- on: megacli.pd_media_error
+# Physical Drives
+
+ template: megacli_phys_drive_predictive_failures
+ on: megacli.phys_drive_predictive_failures
class: Errors
type: System
component: RAID
lookup: sum -10s
- units: media errors
+ units: failures
every: 10s
warn: $this > 0
delay: up 1m down 5m multiplier 2 max 10m
- summary: MegaCLI physical drive errors
- info: Number of physical drive media errors
+ summary: MegaCLI PD adapter ${label:adapter_number} slot ${label:slot_number} predictive failures
+ info: MegaCLI physical drive (adapter ${label:adapter_number} slot ${label:slot_number}) predictive failures
to: sysadmin
-## Battery Backup Units (BBU)
+# Backup Battery Unit
- template: megacli_bbu_relative_charge
- on: megacli.bbu_relative_charge
+ template: megacli_bbu_charge
+ on: megacli.bbu_charge
class: Workload
type: System
component: RAID
@@ -57,12 +58,12 @@ component: RAID
every: 10s
warn: $this <= (($status >= $WARNING) ? (85) : (80))
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
- summary: MegaCLI BBU charge state
- info: Average battery backup unit (BBU) relative state of charge over the last 10 seconds
+ summary: MegaCLI BBU charge
+ info: MegaCLI Backup Battery Unit (adapter ${label:adapter_number}) average charge over the last minute
to: sysadmin
- template: megacli_bbu_cycle_count
- on: megacli.bbu_cycle_count
+ template: megacli_bbu_recharge_cycles
+ on: megacli.bbu_recharge_cycles
class: Workload
type: System
component: RAID
@@ -71,6 +72,6 @@ component: RAID
every: 10s
warn: $this >= 100
crit: $this >= 500
- summary: MegaCLI BBU cycles count
- info: Average battery backup unit (BBU) charge cycles count over the last 10 seconds
+ summary: MegaCLI BBU recharge cycles
+ info: MegaCLI Backup Battery Unit (adapter ${label:adapter_number}) recharge cycles
to: sysadmin
diff --git a/src/health/health.d/ping.conf b/src/health/health.d/ping.conf
index 0e434420d..a91b231c3 100644
--- a/src/health/health.d/ping.conf
+++ b/src/health/health.d/ping.conf
@@ -6,7 +6,7 @@
type: Other
component: Network
lookup: average -30s unaligned of loss
- calc: $this != nan AND $this < 100
+ calc: ($this == nan) ? (nan) : ($this < 100)
units: up/down
every: 10s
crit: $this == 0
diff --git a/src/health/health.d/storcli.conf b/src/health/health.d/storcli.conf
new file mode 100644
index 000000000..be71b517e
--- /dev/null
+++ b/src/health/health.d/storcli.conf
@@ -0,0 +1,61 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# Controllers
+
+ template: storcli_controller_health_status
+ on: storcli.controller_health_status
+ class: Errors
+ type: System
+component: RAID
+ lookup: average -1m unaligned percentage of healthy
+ units: %
+ every: 10s
+ crit: $this < 100
+ delay: down 5m multiplier 2 max 10m
+ summary: RAID controller ${label:controller_number} health
+ info: RAID controller ${label:controller_number} is unhealthy
+ to: sysadmin
+
+ template: storcli_controller_bbu_status
+ on: storcli.controller_bbu_status
+ class: Errors
+ type: System
+component: RAID
+ lookup: average -1m unaligned percentage of healthy,na
+ units: %
+ every: 10s
+ crit: $this < 100
+ delay: down 5m multiplier 2 max 10m
+ summary: RAID controller ${label:controller_number} BBU health
+ info: RAID controller ${label:controller_number} BBU is unhealthy
+ to: sysadmin
+
+# Physical Drives
+
+ template: storcli_phys_drive_errors
+ on: storcli.phys_drive_errors
+ class: Errors
+ type: System
+component: RAID
+ lookup: sum -10s
+ units: errors
+ every: 10s
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 2 max 10m
+ summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors
+ info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors
+ to: sysadmin
+
+ template: storcli_phys_drive_predictive_failures
+ on: storcli.phys_drive_predictive_failures
+ class: Errors
+ type: System
+component: RAID
+ lookup: sum -10s
+ units: failures
+ every: 10s
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 2 max 10m
+ summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures
+ info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures
+ to: sysadmin
diff --git a/src/health/health.d/whoisquery.conf b/src/health/health.d/whoisquery.conf
index 0a328b592..6d87ad280 100644
--- a/src/health/health.d/whoisquery.conf
+++ b/src/health/health.d/whoisquery.conf
@@ -4,11 +4,11 @@
class: Utilization
type: Other
component: WHOIS
- calc: $expiry
- units: seconds
+ calc: $expiry / 86400
+ units: days
every: 60s
- warn: $this < $days_until_expiration_warning*24*60*60
- crit: $this < $days_until_expiration_critical*24*60*60
+ warn: $this < $days_until_expiration_warning
+ crit: $this < $days_until_expiration_critical
summary: Whois expiration time for domain ${label:domain}
info: Time until the domain name registration for ${label:domain} expires
to: webmaster
diff --git a/src/health/health.d/x509check.conf b/src/health/health.d/x509check.conf
index d05f3ef0f..1d40c8602 100644
--- a/src/health/health.d/x509check.conf
+++ b/src/health/health.d/x509check.conf
@@ -4,11 +4,11 @@
class: Latency
type: Certificates
component: x509 certificates
- calc: $expiry
- units: seconds
+ calc: $expiry / 86400
+ units: days
every: 60s
- warn: $this < $days_until_expiration_warning*24*60*60
- crit: $this < $days_until_expiration_critical*24*60*60
+ warn: $this < $days_until_expiration_warning
+ crit: $this < $days_until_expiration_critical
summary: x509 certificate expiration for ${label:source}
info: Time until x509 certificate expires for ${label:source}
to: webmaster
diff --git a/src/health/health.d/zfs.conf b/src/health/health.d/zfs.conf
index d2a561000..9c1f0018b 100644
--- a/src/health/health.d/zfs.conf
+++ b/src/health/health.d/zfs.conf
@@ -42,3 +42,49 @@ component: File system
summary: Critical ZFS pool ${label:pool} state
info: ZFS pool ${label:pool} state is faulted or unavail
to: sysadmin
+
+
+## go.d/zfspool
+
+ template: zfs_pool_space_utilization
+ on: zfspool.pool_space_utilization
+ class: Utilization
+ type: System
+component: File system
+ calc: $utilization
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 1m multiplier 1.5 max 1h
+ summary: ZFS pool ${label:pool} space utilization
+ info: ZFS pool ${label:pool} is nearing capacity. Current space usage is above the threshold.
+ to: sysadmin
+
+ template: zfs_pool_health_state_warn
+ on: zfspool.pool_health_state
+ class: Errors
+ type: System
+component: File system
+ calc: $degraded
+ units: boolean
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 1.5 max 1h
+ summary: ZFS pool ${label:pool} state
+ info: ZFS pool ${label:pool} state is degraded
+ to: sysadmin
+
+ template: zfs_pool_health_state_crit
+ on: zfspool.pool_health_state
+ class: Errors
+ type: System
+component: File system
+ calc: $faulted + $unavail
+ units: boolean
+ every: 10s
+ crit: $this > 0
+ delay: down 1m multiplier 1.5 max 1h
+ summary: Critical ZFS pool ${label:pool} state
+ info: ZFS pool ${label:pool} state is faulted or unavail
+ to: sysadmin
diff --git a/src/health/health.h b/src/health/health.h
index 8aca6dcb2..b1ac5a9e1 100644
--- a/src/health/health.h
+++ b/src/health/health.h
@@ -89,7 +89,7 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const
void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function);
#define health_log_alert(host, ae) health_log_alert_transition_with_trace(host, ae, __LINE__, __FILE__, __FUNCTION__)
-bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, uuid_t *transitions_id);
+bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, nd_uuid_t *transitions_id);
int alert_variable_lookup_trace(RRDHOST *host, RRDSET *st, const char *variable, BUFFER *wb);
diff --git a/src/health/health_config.c b/src/health/health_config.c
index d8c735c3f..c17f7e21d 100644
--- a/src/health/health_config.c
+++ b/src/health/health_config.c
@@ -19,14 +19,14 @@ static inline int health_parse_delay(
while(*s) {
char *key = s;
- while(*s && !isspace(*s)) s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s)) s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if(!*key) break;
char *value = s;
- while(*s && !isspace(*s)) s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s)) s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if(!strcasecmp(key, "up")) {
if (!config_parse_duration(value, delay_up_duration)) {
@@ -91,12 +91,12 @@ static inline ALERT_ACTION_OPTIONS health_parse_options(const char *s) {
buf[0] = '\0';
// skip spaces
- while(*s && isspace(*s))
+ while(*s && isspace((uint8_t)*s))
s++;
// find the next space
size_t count = 0;
- while(*s && count < 100 && !isspace(*s))
+ while(*s && count < 100 && !isspace((uint8_t)*s))
buf[count++] = *s++;
if(buf[0]) {
@@ -124,14 +124,14 @@ static inline int health_parse_repeat(
while(*s) {
char *key = s;
- while(*s && !isspace(*s)) s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s)) s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if(!*key) break;
char *value = s;
- while(*s && !isspace(*s)) s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s)) s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if(!strcasecmp(key, "off")) {
*warn_repeat_every = 0;
@@ -176,8 +176,8 @@ static inline int health_parse_db_lookup(size_t line, const char *filename, char
// first is the group method
key = s;
- while(*s && !isspace(*s) && *s != '(') s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s) && *s != '(') s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if(!*s) {
netdata_log_error("Health configuration invalid chart calculation at line %zu of file '%s': expected group method followed by the 'after' time, but got '%s'",
line, filename, key);
@@ -224,12 +224,12 @@ static inline int health_parse_db_lookup(size_t line, const char *filename, char
ac->time_group_condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS;
}
- while(*s && isspace(*s)) s++;
+ while(*s && isspace((uint8_t)*s)) s++;
if(*s) {
- if(isdigit(*s) || *s == '.') {
+ if(isdigit((uint8_t)*s) || *s == '.') {
ac->time_group_value = str2ndd(s, &s);
- while(s && *s && isspace(*s)) s++;
+ while(s && *s && isspace((uint8_t)*s)) s++;
if(!s || *s != ')') {
netdata_log_error("Health configuration at line %zu of file '%s': missing closing parenthesis after number in aggregation method on '%s'",
@@ -270,8 +270,8 @@ static inline int health_parse_db_lookup(size_t line, const char *filename, char
// then is the 'after' time
key = s;
- while(*s && !isspace(*s)) s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s)) s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if(!config_parse_duration(key, &ac->after)) {
netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method",
@@ -285,14 +285,14 @@ static inline int health_parse_db_lookup(size_t line, const char *filename, char
// now we may have optional parameters
while(*s) {
key = s;
- while(*s && !isspace(*s)) s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s)) s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if(!*key) break;
if(!strcasecmp(key, "at")) {
char *value = s;
- while(*s && !isspace(*s)) s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s)) s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if (!config_parse_duration(value, &ac->before)) {
netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
@@ -301,8 +301,8 @@ static inline int health_parse_db_lookup(size_t line, const char *filename, char
}
else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
char *value = s;
- while(*s && !isspace(*s)) s++;
- while(*s && isspace(*s)) *s++ = '\0';
+ while(*s && !isspace((uint8_t)*s)) s++;
+ while(*s && isspace((uint8_t)*s)) *s++ = '\0';
if (!config_parse_duration(value, &ac->update_every)) {
netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
@@ -651,7 +651,7 @@ int health_readfile(const char *filename, void *data __maybe_unused, bool stock_
lookup_data_source_from_rrdr_options(ap);
dims_grouping_from_rrdr_options(ap);
replace_green_red(ap, green, red);
- health_prototype_add(ap);
+ health_prototype_add(ap, NULL);
freez(ap);
}
@@ -833,7 +833,7 @@ int health_readfile(const char *filename, void *data __maybe_unused, bool stock_
lookup_data_source_from_rrdr_options(ap);
dims_grouping_from_rrdr_options(ap);
replace_green_red(ap, green, red);
- health_prototype_add(ap);
+ health_prototype_add(ap, NULL);
freez(ap);
}
diff --git a/src/health/health_dyncfg.c b/src/health/health_dyncfg.c
index 933f03818..f2b9bc607 100644
--- a/src/health/health_dyncfg.c
+++ b/src/health/health_dyncfg.c
@@ -53,112 +53,114 @@ static void data_source_to_rrdr_options(RRD_ALERT_PROTOTYPE *ap) {
}
}
-static bool parse_match(json_object *jobj, const char *path, struct rrd_alert_match *match, BUFFER *error) {
+static bool parse_match(json_object *jobj, const char *path, struct rrd_alert_match *match, BUFFER *error, bool strict) {
STRING *on = NULL;
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "on", on, error, true);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "on", on, error, strict);
if(match->is_template)
match->on.context = on;
else
match->on.chart = on;
- JSONC_PARSE_TXT2PATTERN_OR_ERROR_AND_RETURN(jobj, path, "host_labels", match->host_labels, error);
- JSONC_PARSE_TXT2PATTERN_OR_ERROR_AND_RETURN(jobj, path, "instance_labels", match->chart_labels, error);
+ JSONC_PARSE_TXT2PATTERN_OR_ERROR_AND_RETURN(jobj, path, "host_labels", match->host_labels, error, strict);
+ JSONC_PARSE_TXT2PATTERN_OR_ERROR_AND_RETURN(jobj, path, "instance_labels", match->chart_labels, error, strict);
return true;
}
-static bool parse_config_value_database_lookup(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error) {
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "after", config->after, error);
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "before", config->before, error);
- JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "time_group", time_grouping_txt2id, config->time_group, error);
- JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "dims_group", alerts_dims_grouping2id, config->dims_group, error);
- JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "data_source", alerts_data_sources2id, config->data_source, error);
+static bool parse_config_value_database_lookup(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) {
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "after", config->after, error, strict);
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "before", config->before, error, strict);
+ JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "time_group", time_grouping_txt2id, config->time_group, error, strict);
+ JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "dims_group", alerts_dims_grouping2id, config->dims_group, error, strict);
+ JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "data_source", alerts_data_sources2id, config->data_source, error, strict);
switch(config->time_group) {
default:
break;
case RRDR_GROUPING_COUNTIF:
- JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "time_group_condition", alerts_group_condition2id, config->time_group_condition, error);
+ JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "time_group_condition", alerts_group_condition2id, config->time_group_condition, error, strict);
// fall through
case RRDR_GROUPING_TRIMMED_MEAN:
case RRDR_GROUPING_TRIMMED_MEDIAN:
case RRDR_GROUPING_PERCENTILE:
- JSONC_PARSE_DOUBLE_OR_ERROR_AND_RETURN(jobj, path, "time_group_value", config->time_group_value, error);
+ JSONC_PARSE_DOUBLE_OR_ERROR_AND_RETURN(jobj, path, "time_group_value", config->time_group_value, error, strict);
break;
}
- JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, "options", rrdr_options_parse_one, config->options, error);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "dimensions", config->dimensions, error, true);
+ JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, "options", rrdr_options_parse_one, config->options, error, strict);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "dimensions", config->dimensions, error, strict);
return true;
}
-static bool parse_config_value(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error) {
- JSONC_PARSE_SUBOBJECT(jobj, path, "database_lookup", config, parse_config_value_database_lookup, error);
- JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "calculation", config->calculation, error);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "units", config->units, error, true);
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "update_every", config->update_every, error);
+
+static bool parse_config_value(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) {
+ JSONC_PARSE_SUBOBJECT(jobj, path, "database_lookup", config, parse_config_value_database_lookup, error, strict);
+ JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "calculation", config->calculation, error, false);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "units", config->units, error, false);
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "update_every", config->update_every, error, strict);
return true;
}
-static bool parse_config_conditions(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error) {
- JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "warning_condition", config->warning, error);
- JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "critical_condition", config->critical, error);
+static bool parse_config_conditions(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) {
+ JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "warning_condition", config->warning, error, strict);
+ JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, "critical_condition", config->critical, error, strict);
return true;
}
-static bool parse_config_action_delay(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error) {
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "up", config->delay_up_duration, error);
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "down", config->delay_down_duration, error);
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "max", config->delay_max_duration, error);
- JSONC_PARSE_DOUBLE_OR_ERROR_AND_RETURN(jobj, path, "multiplier", config->delay_multiplier, error);
+static bool parse_config_action_delay(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) {
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "up", config->delay_up_duration, error, strict);
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "down", config->delay_down_duration, error, strict);
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "max", config->delay_max_duration, error, strict);
+ JSONC_PARSE_DOUBLE_OR_ERROR_AND_RETURN(jobj, path, "multiplier", config->delay_multiplier, error, strict);
return true;
}
-static bool parse_config_action_repeat(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error) {
- JSONC_PARSE_BOOL_OR_ERROR_AND_RETURN(jobj, path, "enabled", config->has_custom_repeat_config, error);
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "warning", config->warn_repeat_every, error);
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "critical", config->crit_repeat_every, error);
+
+static bool parse_config_action_repeat(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) {
+ JSONC_PARSE_BOOL_OR_ERROR_AND_RETURN(jobj, path, "enabled", config->has_custom_repeat_config, error, strict);
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "warning", config->warn_repeat_every, error, strict);
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "critical", config->crit_repeat_every, error, strict);
return true;
}
-static bool parse_config_action(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error) {
- JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, "options", alert_action_options_parse_one, config->alert_action_options, error);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "execute", config->exec, error, true);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "recipient", config->recipient, error, true);
- JSONC_PARSE_SUBOBJECT(jobj, path, "delay", config, parse_config_action_delay, error);
- JSONC_PARSE_SUBOBJECT(jobj, path, "repeat", config, parse_config_action_repeat, error);
+static bool parse_config_action(json_object *jobj, const char *path, struct rrd_alert_config *config, BUFFER *error, bool strict) {
+ JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, "options", alert_action_options_parse_one, config->alert_action_options, error, strict);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "execute", config->exec, error, strict);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "recipient", config->recipient, error, strict);
+ JSONC_PARSE_SUBOBJECT(jobj, path, "delay", config, parse_config_action_delay, error, strict);
+ JSONC_PARSE_SUBOBJECT(jobj, path, "repeat", config, parse_config_action_repeat, error, strict);
return true;
}
-static bool parse_config(json_object *jobj, const char *path, RRD_ALERT_PROTOTYPE *ap, BUFFER *error) {
+static bool parse_config(json_object *jobj, const char *path, RRD_ALERT_PROTOTYPE *ap, BUFFER *error, bool strict) {
// we shouldn't parse these from the payload - they are given to us via the function call
- // JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "source_type", dyncfg_source_type2id, ap->config.source_type, error);
- // JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "source", ap->config.source, error, true);
+ // JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "source_type", dyncfg_source_type2id, ap->config.source_type, error, strict);
+ // JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "source", ap->config.source, error, strict);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "summary", ap->config.summary, error, true);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "info", ap->config.info, error, true);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "type", ap->config.type, error, true);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "component", ap->config.component, error, true);
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "classification", ap->config.classification, error, true);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "summary", ap->config.summary, error, false);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "info", ap->config.info, error, false);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "type", ap->config.type, error, false);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "component", ap->config.component, error, false);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "classification", ap->config.classification, error, false);
- JSONC_PARSE_SUBOBJECT(jobj, path, "value", &ap->config, parse_config_value, error);
- JSONC_PARSE_SUBOBJECT(jobj, path, "conditions", &ap->config, parse_config_conditions, error);
- JSONC_PARSE_SUBOBJECT(jobj, path, "action", &ap->config, parse_config_action, error);
- JSONC_PARSE_SUBOBJECT(jobj, path, "match", &ap->match, parse_match, error);
+ JSONC_PARSE_SUBOBJECT(jobj, path, "value", &ap->config, parse_config_value, error, strict);
+ JSONC_PARSE_SUBOBJECT(jobj, path, "conditions", &ap->config, parse_config_conditions, error, false);
+ JSONC_PARSE_SUBOBJECT(jobj, path, "action", &ap->config, parse_config_action, error, false);
+ JSONC_PARSE_SUBOBJECT(jobj, path, "match", &ap->match, parse_match, error, strict);
return true;
}
-static bool parse_prototype(json_object *jobj, const char *path, RRD_ALERT_PROTOTYPE *base, BUFFER *error, const char *name) {
- int64_t version;
- JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "format_version", version, error);
+static bool parse_prototype(json_object *jobj, const char *path, RRD_ALERT_PROTOTYPE *base, BUFFER *error, const char *name, bool strict) {
+ int64_t version = 0;
+ JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, "format_version", version, error, strict);
if(version != 1) {
buffer_sprintf(error, "unsupported document version");
return false;
}
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "name", base->config.name, error, !name && !*name);
+ // The payload must carry a "name" only when the caller supplied none (NULL or empty) and we parse strictly.
+ // The previous `!name && !*name` dereferenced name exactly when it was NULL and was false otherwise.
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, "name", base->config.name, error, ((!name || !*name) && strict));
json_object *rules;
if (json_object_object_get_ex(jobj, "rules", &rules)) {
@@ -174,10 +176,10 @@ static bool parse_prototype(json_object *jobj, const char *path, RRD_ALERT_PROTO
json_object *rule = json_object_array_get_idx(rules, i);
- JSONC_PARSE_BOOL_OR_ERROR_AND_RETURN(rule, path, "enabled", ap->match.enabled, error);
+ JSONC_PARSE_BOOL_OR_ERROR_AND_RETURN(rule, path, "enabled", ap->match.enabled, error, strict);
STRING *type = NULL;
- JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(rule, path, "type", type, error, true);
+ JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(rule, path, "type", type, error, strict);
if(string_strcmp(type, "template") == 0)
ap->match.is_template = true;
else if(string_strcmp(type, "instance") == 0)
@@ -187,7 +189,7 @@ static bool parse_prototype(json_object *jobj, const char *path, RRD_ALERT_PROTO
return false;
}
- JSONC_PARSE_SUBOBJECT(rule, path, "config", ap, parse_config, error);
+ JSONC_PARSE_SUBOBJECT(rule, path, "config", ap, parse_config, error, strict);
ap = NULL; // so that we will create another one, if available
}
@@ -200,7 +202,7 @@ static bool parse_prototype(json_object *jobj, const char *path, RRD_ALERT_PROTO
return true;
}
-static RRD_ALERT_PROTOTYPE *health_prototype_payload_parse(const char *payload, size_t payload_len, BUFFER *error, const char *name) {
+static RRD_ALERT_PROTOTYPE *health_prototype_payload_parse(const char *payload, size_t payload_len, BUFFER *error, const char *name, bool strict) {
RRD_ALERT_PROTOTYPE *base = callocz(1, sizeof(*base));
CLEAN_JSON_OBJECT *jobj = NULL;
@@ -219,12 +221,17 @@ static RRD_ALERT_PROTOTYPE *health_prototype_payload_parse(const char *payload,
}
json_tokener_free(tokener);
- if(!parse_prototype(jobj, "", base, error, name))
+ if(!parse_prototype(jobj, "", base, error, name, strict))
goto cleanup;
if(!base->config.name && name)
base->config.name = string_strdupz(name);
+ if(name && *name && string_strcmp(base->config.name, name) != 0) {
+ string_freez(base->config.name);
+ base->config.name = string_strdupz(name);
+ }
+
int i = 1;
for(RRD_ALERT_PROTOTYPE *ap = base; ap; ap = ap->_internal.next, i++) {
if(ap->config.name != base->config.name) {
@@ -232,8 +239,8 @@ static RRD_ALERT_PROTOTYPE *health_prototype_payload_parse(const char *payload,
ap->config.name = string_dup(base->config.name);
}
- if(!RRDCALC_HAS_DB_LOOKUP(ap) && !ap->config.calculation) {
- buffer_sprintf(error, "the rule No %d has neither database lookup nor calculation", i);
+ if(!RRDCALC_HAS_DB_LOOKUP(ap) && !ap->config.calculation && strict) {
+ buffer_sprintf(error, "Item %d has neither database lookup nor calculation", i - 1);
goto cleanup;
}
@@ -244,13 +251,6 @@ static RRD_ALERT_PROTOTYPE *health_prototype_payload_parse(const char *payload,
base->_internal.enabled = true;
}
- if(string_strcmp(base->config.name, name) != 0) {
- buffer_sprintf(error,
- "name parsed ('%s') does not match the name of the alert prototype ('%s')",
- string2str(base->config.name), name);
- goto cleanup;
- }
-
return base;
cleanup:
@@ -370,6 +370,152 @@ void health_prototype_to_json(BUFFER *wb, RRD_ALERT_PROTOTYPE *ap, bool for_hash
// ---------------------------------------------------------------------------------------------------------------------
+// Appends `prefix` (nothing when NULL) followed by `seconds`, expressed in the
+// largest unit that divides it exactly: hours ("h"), else minutes ("m"), else
+// plain seconds ("s").
+static inline void dyncfg_user_config_print_duration(BUFFER *wb, const char *prefix, int seconds) {
+    const char *lead = prefix ? prefix : "";
+
+    // whole hours
+    if((seconds % 3600) == 0) {
+        buffer_sprintf(wb, "%s%dh", lead, seconds / 3600);
+        return;
+    }
+
+    // whole minutes
+    if((seconds % 60) == 0) {
+        buffer_sprintf(wb, "%s%dm", lead, seconds / 60);
+        return;
+    }
+
+    // anything else is printed as-is, in seconds
+    buffer_sprintf(wb, "%s%ds", lead, seconds);
+}
+
+// Renders a chain of alert prototypes as health-configuration style text,
+// one "key: value" line per setting (keys right-aligned to 13 characters).
+//
+// wb   - output buffer; it is flushed first, its content type is set to
+//        CT_TEXT_PLAIN and its expiration to now_realtime_sec()
+// ap   - first prototype of the chain; every entry reachable through
+//        _internal.next is written as its own stanza, with an empty line
+//        between consecutive stanzas
+// name - value printed on the "template" / "alarm" line of every stanza
+//
+// Settings that are unset (NULL / zero) are omitted. Always returns 200.
+int dyncfg_health_prototype_to_conf(BUFFER *wb, RRD_ALERT_PROTOTYPE *ap, const char *name) {
+    buffer_flush(wb);
+    wb->content_type = CT_TEXT_PLAIN;
+    wb->expires = now_realtime_sec();
+
+    int n = 0;
+    for(RRD_ALERT_PROTOTYPE *nap = ap; nap ; nap = nap->_internal.next) {
+        // separate stanzas with an empty line
+        if(++n > 1)
+            buffer_sprintf(wb, "\n");
+
+        if(nap->match.is_template) {
+            buffer_sprintf(wb, "%13s: %s\n", "template", name);
+            buffer_sprintf(wb, "%13s: %s\n", "on", string2str(nap->match.on.context));
+        }
+        else {
+            buffer_sprintf(wb, "%13s: %s\n", "alarm", name);
+            buffer_sprintf(wb, "%13s: %s\n", "on", string2str(nap->match.on.chart));
+        }
+
+        if(nap->config.classification)
+            buffer_sprintf(wb, "%13s: %s\n", "class", string2str(nap->config.classification));
+
+        if(nap->config.type)
+            buffer_sprintf(wb, "%13s: %s\n", "type", string2str(nap->config.type));
+
+        if(nap->config.component)
+            buffer_sprintf(wb, "%13s: %s\n", "component", string2str(nap->config.component));
+
+        if(nap->match.host_labels)
+            buffer_sprintf(wb, "%13s: %s\n", "host labels", string2str(nap->match.host_labels));
+
+        if(nap->match.chart_labels)
+            buffer_sprintf(wb, "%13s: %s\n", "chart labels", string2str(nap->match.chart_labels));
+
+        // the lookup line is emitted only when 'after' is set
+        if(nap->config.after) {
+            buffer_sprintf(wb, "%13s: %s", "lookup", time_grouping_tostring(nap->config.time_group));
+            switch(nap->config.time_group) {
+                case RRDR_GROUPING_PERCENTILE:
+                case RRDR_GROUPING_TRIMMED_MEAN:
+                case RRDR_GROUPING_TRIMMED_MEDIAN:
+                    buffer_sprintf(wb, "(%0.2f)", nap->config.time_group_value);
+                    break;
+
+                case RRDR_GROUPING_COUNTIF:
+                    buffer_sprintf(wb, "(%s%0.2f)", alerts_group_conditions_id2txt(nap->config.time_group_condition), nap->config.time_group_value);
+                    break;
+
+                default:
+                    break;
+            }
+
+            dyncfg_user_config_print_duration(wb, " ", nap->config.after);
+
+            if(nap->config.before)
+                dyncfg_user_config_print_duration(wb, " at ", nap->config.before);
+
+            if(nap->config.options) {
+                buffer_strcat(wb, " ");
+                rrdr_options_to_buffer(wb, nap->config.options);
+            }
+
+            if(nap->config.dimensions)
+                buffer_sprintf(wb, " of %s", string2str(nap->config.dimensions));
+
+            buffer_strcat(wb, "\n");
+        }
+
+        if(nap->config.calculation)
+            buffer_sprintf(wb, "%13s: %s\n", "calc", expression_source(nap->config.calculation));
+
+        if(nap->config.units)
+            buffer_sprintf(wb, "%13s: %s\n", "units", string2str(nap->config.units));
+
+        if(nap->config.update_every) {
+            buffer_sprintf(wb, "%13s: ", "every");
+            dyncfg_user_config_print_duration(wb, NULL, nap->config.update_every);
+            buffer_strcat(wb, "\n");
+        }
+
+        if(nap->config.warning)
+            buffer_sprintf(wb, "%13s: %s\n", "warn", expression_source(nap->config.warning));
+
+        if(nap->config.critical)
+            buffer_sprintf(wb, "%13s: %s\n", "crit", expression_source(nap->config.critical));
+
+        // the delay line is emitted only when an up or down duration is set;
+        // each part below brings its own leading space
+        if(nap->config.delay_up_duration || nap->config.delay_down_duration) {
+            buffer_sprintf(wb, "%13s:", "delay");
+
+            if(nap->config.delay_up_duration)
+                dyncfg_user_config_print_duration(wb, " up ", nap->config.delay_up_duration);
+
+            if(nap->config.delay_down_duration)
+                dyncfg_user_config_print_duration(wb, " down ", nap->config.delay_down_duration);
+
+            if(nap->config.delay_multiplier)
+                buffer_sprintf(wb, " multiplier %0.2f", nap->config.delay_multiplier);
+
+            if(nap->config.delay_max_duration)
+                dyncfg_user_config_print_duration(wb, " max ", nap->config.delay_max_duration);
+
+            buffer_strcat(wb, "\n");
+        }
+
+        if(nap->config.alert_action_options) {
+            // alert_action_options_to_buffer() adds no leading space, so the
+            // separator after the colon has to come from here
+            buffer_sprintf(wb, "%13s: ", "options");
+            alert_action_options_to_buffer(wb, nap->config.alert_action_options);
+            buffer_strcat(wb, "\n");
+        }
+
+        if(nap->config.has_custom_repeat_config) {
+            if(!nap->config.crit_repeat_every && !nap->config.warn_repeat_every)
+                buffer_sprintf(wb, "%13s: off\n", "repeat");
+            else {
+                // emit the key first - the two calls below only append
+                // " warning <duration>" and " critical <duration>"
+                buffer_sprintf(wb, "%13s:", "repeat");
+                dyncfg_user_config_print_duration(wb, " warning ", (int)nap->config.warn_repeat_every);
+                dyncfg_user_config_print_duration(wb, " critical ", (int)nap->config.crit_repeat_every);
+                buffer_strcat(wb, "\n");
+            }
+        }
+
+        if(nap->config.summary)
+            buffer_sprintf(wb, "%13s: %s\n", "summary", string2str(nap->config.summary));
+
+        if(nap->config.info)
+            buffer_sprintf(wb, "%13s: %s\n", "info", string2str(nap->config.info));
+
+        // skip the exec line when it is the same STRING as localhost's default exec
+        if(nap->config.exec && nap->config.exec != localhost->health.health_default_exec)
+            buffer_sprintf(wb, "%13s: %s\n", "exec", string2str(nap->config.exec));
+
+        if(nap->config.recipient)
+            buffer_sprintf(wb, "%13s: %s\n", "to", string2str(nap->config.recipient));
+    }
+
+    return 200;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+
static size_t dyncfg_health_remove_all_rrdcalc_of_prototype(STRING *alert_name) {
size_t removed = 0;
@@ -400,16 +546,19 @@ static int dyncfg_health_prototype_template_action(BUFFER *result, DYNCFG_CMDS c
switch(cmd) {
case DYNCFG_CMD_ADD: {
CLEAN_BUFFER *error = buffer_create(0, NULL);
- RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, add_name);
+ RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, add_name, true);
if(!nap)
code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, buffer_tostring(error));
else {
+ char *msg = "";
+
nap->config.source_type = DYNCFG_SOURCE_TYPE_DYNCFG;
- bool added = health_prototype_add(nap); // this swaps ap <-> nap
+ bool added = health_prototype_add(nap, &msg); // this swaps ap <-> nap
if(!added) {
health_prototype_free(nap);
- return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "required attributes are missing");
+ if(!msg || !*msg) msg = "required attributes are missing";
+ return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, msg);
}
else
freez(nap);
@@ -430,6 +579,18 @@ static int dyncfg_health_prototype_template_action(BUFFER *result, DYNCFG_CMDS c
}
break;
+ case DYNCFG_CMD_USERCONFIG: {
+ CLEAN_BUFFER *error = buffer_create(0, NULL);
+ RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, add_name, false);
+ if(!nap)
+ code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, buffer_tostring(error));
+ else {
+ code = dyncfg_health_prototype_to_conf(result, nap, add_name);
+ health_prototype_free(nap);
+ }
+ }
+ break;
+
case DYNCFG_CMD_SCHEMA:
code = dyncfg_default_response(result, HTTP_RESP_NOT_IMPLEMENTED, "schema not implemented yet for prototype templates");
break;
@@ -513,16 +674,18 @@ static int dyncfg_health_prototype_job_action(BUFFER *result, DYNCFG_CMDS cmd, B
case DYNCFG_CMD_UPDATE: {
CLEAN_BUFFER *error = buffer_create(0, NULL);
- RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, alert_name);
+ RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, alert_name, true);
if(!nap)
code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, buffer_tostring(error));
else {
+ char *msg = "";
nap->config.source_type = DYNCFG_SOURCE_TYPE_DYNCFG;
- bool added = health_prototype_add(nap); // this swaps ap <-> nap
+ bool added = health_prototype_add(nap, &msg); // this swaps ap <-> nap
if(!added) {
health_prototype_free(nap);
- return dyncfg_default_response( result, HTTP_RESP_BAD_REQUEST, "required attributes are missing");
+ if(!msg || !*msg) msg = "required attributes are missing";
+ return dyncfg_default_response( result, HTTP_RESP_BAD_REQUEST, msg);
}
else
freez(nap);
@@ -534,6 +697,18 @@ static int dyncfg_health_prototype_job_action(BUFFER *result, DYNCFG_CMDS cmd, B
}
break;
+ case DYNCFG_CMD_USERCONFIG: {
+ CLEAN_BUFFER *error = buffer_create(0, NULL);
+ RRD_ALERT_PROTOTYPE *nap = health_prototype_payload_parse(buffer_tostring(payload), buffer_strlen(payload), error, alert_name, false);
+ if(!nap)
+ code = dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, buffer_tostring(error));
+ else {
+ code = dyncfg_health_prototype_to_conf(result, nap, alert_name);
+ health_prototype_free(nap);
+ }
+ }
+ break;
+
case DYNCFG_CMD_REMOVE:
dyncfg_health_remove_all_rrdcalc_of_prototype(ap->config.name);
dictionary_del(health_globals.prototypes.dict, dictionary_acquired_item_name(item));
@@ -621,7 +796,7 @@ static void health_dyncfg_register_prototype(RRD_ALERT_PROTOTYPE *ap) {
ap->_internal.enabled ? DYNCFG_STATUS_ACCEPTED : DYNCFG_STATUS_DISABLED, DYNCFG_TYPE_JOB,
ap->config.source_type, string2str(ap->config.source),
DYNCFG_CMD_SCHEMA | DYNCFG_CMD_GET | DYNCFG_CMD_ENABLE | DYNCFG_CMD_DISABLE |
- DYNCFG_CMD_UPDATE |
+ DYNCFG_CMD_UPDATE | DYNCFG_CMD_USERCONFIG |
(ap->config.source_type == DYNCFG_SOURCE_TYPE_DYNCFG && !ap->_internal.is_on_disk ? DYNCFG_CMD_REMOVE : 0),
HTTP_ACCESS_NONE,
HTTP_ACCESS_NONE,
@@ -654,7 +829,7 @@ void health_dyncfg_register_all_prototypes(void) {
DYNCFG_HEALTH_ALERT_PROTOTYPE_PREFIX, "/health/alerts/prototypes",
DYNCFG_STATUS_ACCEPTED, DYNCFG_TYPE_TEMPLATE,
DYNCFG_SOURCE_TYPE_INTERNAL, "internal",
- DYNCFG_CMD_SCHEMA | DYNCFG_CMD_ADD | DYNCFG_CMD_ENABLE | DYNCFG_CMD_DISABLE,
+ DYNCFG_CMD_SCHEMA | DYNCFG_CMD_ADD | DYNCFG_CMD_ENABLE | DYNCFG_CMD_DISABLE | DYNCFG_CMD_USERCONFIG,
HTTP_ACCESS_NONE,
HTTP_ACCESS_NONE,
dyncfg_health_cb, NULL);
diff --git a/src/health/health_event_loop.c b/src/health/health_event_loop.c
index a4b8caff3..756ffa165 100644
--- a/src/health/health_event_loop.c
+++ b/src/health/health_event_loop.c
@@ -201,6 +201,7 @@ static void health_event_loop(void) {
"Postponing alarm checks for %"PRId32" seconds, "
"because it seems that the system was just resumed from suspension.",
(int32_t)health_globals.config.postpone_alarms_during_hibernation_for_seconds);
+ schedule_node_info_update(localhost);
}
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
@@ -286,6 +287,7 @@ static void health_event_loop(void) {
if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
+
if (!rrdcalc_isrepeating(rc)) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
time_t now_tmp = now_realtime_sec();
@@ -738,16 +740,16 @@ static void health_event_loop(void) {
}
-static void health_main_cleanup(void *ptr) {
- worker_unregister();
+static void health_main_cleanup(void *pptr) {
+ struct netdata_static_thread *static_thread = CLEANUP_FUNCTION_GET_PTR(pptr);
+ if(!static_thread) return;
- struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
+ worker_unregister();
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
netdata_log_info("cleaning up...");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
- nd_log(NDLS_DAEMON, NDLP_DEBUG,
- "Health thread ended.");
+ nd_log(NDLS_DAEMON, NDLP_DEBUG, "Health thread ended.");
}
void *health_main(void *ptr) {
@@ -763,10 +765,7 @@ void *health_main(void *ptr) {
worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
- netdata_thread_cleanup_push(health_main_cleanup, ptr);
- {
- health_event_loop();
- }
- netdata_thread_cleanup_pop(1);
+ CLEANUP_FUNCTION_REGISTER(health_main_cleanup) cleanup_ptr = ptr;
+ health_event_loop();
return NULL;
}
diff --git a/src/health/health_internals.h b/src/health/health_internals.h
index d24a2422b..638a96195 100644
--- a/src/health/health_internals.h
+++ b/src/health/health_internals.h
@@ -44,6 +44,7 @@
#define HEALTH_CHART_LABEL_KEY "chart labels"
void alert_action_options_to_buffer_json_array(BUFFER *wb, const char *key, ALERT_ACTION_OPTIONS options);
+void alert_action_options_to_buffer(BUFFER *wb, ALERT_ACTION_OPTIONS options);
ALERT_ACTION_OPTIONS alert_action_options_parse(char *o);
ALERT_ACTION_OPTIONS alert_action_options_parse_one(const char *o);
@@ -59,7 +60,7 @@ typedef struct rrd_alert_prototype {
struct rrd_alert_prototype *prev, *next;
} _internal;
} RRD_ALERT_PROTOTYPE;
-bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap);
+bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap, char **msg);
void health_prototype_cleanup(RRD_ALERT_PROTOTYPE *ap);
void health_prototype_free(RRD_ALERT_PROTOTYPE *ap);
diff --git a/src/health/health_log.c b/src/health/health_log.c
index 8839b2da5..b04f8f248 100644
--- a/src/health/health_log.c
+++ b/src/health/health_log.c
@@ -10,6 +10,8 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function) {
+ if(!host || !ae) return;
+
ND_LOG_STACK lgs[] = {
ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &health_alert_transition_msgid),
ND_LOG_FIELD_STR(NDF_NIDL_NODE, host->hostname),
diff --git a/src/health/health_notifications.c b/src/health/health_notifications.c
index c986ba292..79426f48c 100644
--- a/src/health/health_notifications.c
+++ b/src/health/health_notifications.c
@@ -108,7 +108,7 @@ static bool prepare_command(BUFFER *wb,
const char *classification,
const char *edit_command,
const char *machine_guid,
- uuid_t *transition_id,
+ nd_uuid_t *transition_id,
const char *summary,
const char *context,
const char *component,
@@ -479,7 +479,7 @@ done:
health_alarm_log_save(host, ae);
}
-bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, uuid_t *transitions_id) {
+bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, nd_uuid_t *transitions_id) {
if(!rc->rrdset)
return false;
diff --git a/src/health/health_prototypes.c b/src/health/health_prototypes.c
index ceefbc222..c43096115 100644
--- a/src/health/health_prototypes.c
+++ b/src/health/health_prototypes.c
@@ -176,6 +176,21 @@ void alert_action_options_to_buffer_json_array(BUFFER *wb, const char *key, ALER
buffer_json_array_close(wb);
}
+// Appends to wb the names of all alert action options set in `options`,
+// separated by single spaces (no leading or trailing space is added).
+// Names are taken from the alert_action_options[] table, in table order;
+// a table entry whose bits were already printed is skipped.
+void alert_action_options_to_buffer(BUFFER *wb, ALERT_ACTION_OPTIONS options) {
+    // same type as `options` - this accumulates alert action bits, not RRDR_OPTIONS
+    ALERT_ACTION_OPTIONS used = 0; // to prevent adding duplicates
+    for(int i = 0; alert_action_options[i].name ; i++) {
+        if (unlikely((alert_action_options[i].value & options) && !(alert_action_options[i].value & used))) {
+            // separator goes before every name except the first one printed
+            if(used != 0)
+                buffer_strcat(wb, " ");
+
+            const char *name = alert_action_options[i].name;
+            used |= alert_action_options[i].value;
+
+            buffer_strcat(wb, name);
+        }
+    }
+}
+
static void alert_action_options_init(void) {
for(int i = 0; alert_action_options[i].name ; i++)
alert_action_options[i].hash = simple_hash(alert_action_options[i].name);
@@ -374,18 +389,20 @@ static void health_prototype_activate_match_patterns(struct rrd_alert_match *am)
void health_prototype_hash_id(RRD_ALERT_PROTOTYPE *ap) {
CLEAN_BUFFER *wb = buffer_create(100, NULL);
health_prototype_to_json(wb, ap, true);
- UUID uuid = UUID_generate_from_hash(buffer_tostring(wb), buffer_strlen(wb));
+ ND_UUID uuid = UUID_generate_from_hash(buffer_tostring(wb), buffer_strlen(wb));
uuid_copy(ap->config.hash_id, uuid.uuid);
- (void) sql_alert_store_config(ap);
+ sql_alert_store_config(ap);
}
-bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap) {
+bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap, char **msg) {
if(!ap->match.is_template) {
if(!ap->match.on.chart) {
netdata_log_error(
"HEALTH: alert '%s' does not define a instance (parameter 'on'). Source: %s",
string2str(ap->config.name), string2str(ap->config.source));
+ if(msg)
+ *msg = "missing match 'on' parameter for instance";
return false;
}
}
@@ -394,6 +411,8 @@ bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap) {
netdata_log_error(
"HEALTH: alert '%s' does not define a context (parameter 'on'). Source: %s",
string2str(ap->config.name), string2str(ap->config.source));
+ if(msg)
+ *msg = "missing match 'on' parameter for context";
return false;
}
}
@@ -402,6 +421,8 @@ bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap) {
netdata_log_error(
"HEALTH: alert '%s' has no frequency (parameter 'every'). Source: %s",
string2str(ap->config.name), string2str(ap->config.source));
+ if(msg)
+ *msg = "missing update frequency";
return false;
}
@@ -409,6 +430,8 @@ bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap) {
netdata_log_error(
"HEALTH: alert '%s' is useless (no db lookup, no calculation, no warning and no critical expressions). Source: %s",
string2str(ap->config.name), string2str(ap->config.source));
+ if(msg)
+ *msg = "no db lookup, calculation and warning/critical conditions";
return false;
}
@@ -575,18 +598,27 @@ static void health_prototype_apply_to_rrdset(RRDSET *st, RRD_ALERT_PROTOTYPE *ap
return;
spinlock_lock(&ap->_internal.spinlock);
- for(RRD_ALERT_PROTOTYPE *t = ap; t ; t = t->_internal.next) {
- if(!t->match.enabled)
- continue;
+ for(size_t template = 0; template < 2; template++) {
+ bool want_template = template ? true : false;
+
+ for (RRD_ALERT_PROTOTYPE *t = ap; t; t = t->_internal.next) {
+ if (!t->match.enabled)
+ continue;
- if(!prototype_matches_host(st->rrdhost, t))
- continue;
+ bool is_template = t->match.is_template ? true : false;
- if(!prototype_matches_rrdset(st, t))
- continue;
+ if (is_template != want_template)
+ continue;
- if(rrdcalc_add_from_prototype(st->rrdhost, st, t))
- ap->_internal.uses++;
+ if (!prototype_matches_host(st->rrdhost, t))
+ continue;
+
+ if (!prototype_matches_rrdset(st, t))
+ continue;
+
+ if (rrdcalc_add_from_prototype(st->rrdhost, st, t))
+ ap->_internal.uses++;
+ }
}
spinlock_unlock(&ap->_internal.spinlock);
}
diff --git a/src/health/health_prototypes.h b/src/health/health_prototypes.h
index cbb5dba06..e226c1929 100644
--- a/src/health/health_prototypes.h
+++ b/src/health/health_prototypes.h
@@ -57,7 +57,7 @@ struct rrd_alert_match {
void rrd_alert_match_cleanup(struct rrd_alert_match *am);
struct rrd_alert_config {
- uuid_t hash_id;
+ nd_uuid_t hash_id;
STRING *name; // the name of this alarm
diff --git a/src/health/notifications/README.md b/src/health/notifications/README.md
index 5a0b74045..5a2b032a3 100644
--- a/src/health/notifications/README.md
+++ b/src/health/notifications/README.md
@@ -10,10 +10,10 @@ The default script is `alarm-notify.sh`.
>
> This file mentions editing configuration files.
>
-> - To edit configuration files in a safe way, we provide the [`edit config` script](https://github.com/netdata/netdata/blob/master/docs/configure/nodes.md#use-edit-config-to-edit-configuration-files) located in your [Netdata config directory](https://github.com/netdata/netdata/blob/master/docs/configure/nodes.md#the-netdata-config-directory) (typically is `/etc/netdata`) that creates the proper file and opens it in an editor automatically.
+> - To edit configuration files in a safe way, we provide the [`edit config` script](/docs/netdata-agent/configuration/README.md#edit-netdataconf) located in your [Netdata config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory) (typically `/etc/netdata`) that creates the proper file and opens it in an editor automatically.
> Note that to run the script you need to be inside your Netdata config directory.
>
-> - Please also note that after most configuration changes you will need to [restart the Agent](https://github.com/netdata/netdata/blob/master/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for the changes to take effect.
+> - Please also note that after most configuration changes you will need to [restart the Agent](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for the changes to take effect.
>
> It is recommended to use this way for configuring Netdata.
@@ -113,7 +113,7 @@ export NETDATA_ALARM_NOTIFY_DEBUG=1
/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE"
```
-If you are [running your own registry](https://github.com/netdata/netdata/blob/master/src/registry/README.md#run-your-own-registry), add `export NETDATA_REGISTRY_URL=[YOUR_URL]` before calling `alarm-notify.sh`.
+If you are [running your own registry](/src/registry/README.md#run-your-own-registry), add `export NETDATA_REGISTRY_URL=[YOUR_URL]` before calling `alarm-notify.sh`.
> If you need to dig even deeper, you can trace the execution with `bash -x`. Note that in test mode, `alarm-notify.sh` calls itself with many more arguments. So first do:
>
diff --git a/src/health/notifications/alerta/README.md b/src/health/notifications/alerta/README.md
index e0e2d34da..40fef3fd7 100644
--- a/src/health/notifications/alerta/README.md
+++ b/src/health/notifications/alerta/README.md
@@ -40,7 +40,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -50,7 +50,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/awssns/README.md b/src/health/notifications/awssns/README.md
index 350ef6b8d..b5a4cc5f4 100644
--- a/src/health/notifications/awssns/README.md
+++ b/src/health/notifications/awssns/README.md
@@ -57,7 +57,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -67,7 +67,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/custom/README.md b/src/health/notifications/custom/README.md
index 47af067ec..785aec59d 100644
--- a/src/health/notifications/custom/README.md
+++ b/src/health/notifications/custom/README.md
@@ -37,7 +37,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -47,7 +47,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/discord/README.md b/src/health/notifications/discord/README.md
index c6200e45c..128e04a44 100644
--- a/src/health/notifications/discord/README.md
+++ b/src/health/notifications/discord/README.md
@@ -39,7 +39,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -49,7 +49,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/dynatrace/README.md b/src/health/notifications/dynatrace/README.md
index 4e70f9016..6785cdb82 100644
--- a/src/health/notifications/dynatrace/README.md
+++ b/src/health/notifications/dynatrace/README.md
@@ -42,7 +42,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -52,7 +52,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/email/README.md b/src/health/notifications/email/README.md
index bc469c7af..1e831d58e 100644
--- a/src/health/notifications/email/README.md
+++ b/src/health/notifications/email/README.md
@@ -38,7 +38,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -48,7 +48,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/flock/README.md b/src/health/notifications/flock/README.md
index 8bd9d5cf9..332ede832 100644
--- a/src/health/notifications/flock/README.md
+++ b/src/health/notifications/flock/README.md
@@ -38,7 +38,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -48,7 +48,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/gotify/README.md b/src/health/notifications/gotify/README.md
index ec3b13df1..f0f8a7edb 100644
--- a/src/health/notifications/gotify/README.md
+++ b/src/health/notifications/gotify/README.md
@@ -39,7 +39,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -49,7 +49,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/irc/README.md b/src/health/notifications/irc/README.md
index d1d062a0e..76d3f5bc2 100644
--- a/src/health/notifications/irc/README.md
+++ b/src/health/notifications/irc/README.md
@@ -38,7 +38,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -48,7 +48,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/kavenegar/README.md b/src/health/notifications/kavenegar/README.md
index 408e2e3a9..eedd43a23 100644
--- a/src/health/notifications/kavenegar/README.md
+++ b/src/health/notifications/kavenegar/README.md
@@ -39,7 +39,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -49,7 +49,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/matrix/README.md b/src/health/notifications/matrix/README.md
index 491806d6b..3c01a9ef2 100644
--- a/src/health/notifications/matrix/README.md
+++ b/src/health/notifications/matrix/README.md
@@ -27,7 +27,7 @@ Send notifications to Matrix network rooms using Netdata's Agent alert notificat
- The url of the homeserver (`https://homeserver:port`).
- Credentials for connecting to the homeserver, in the form of a valid access token for your account (or for a dedicated notification account). These tokens usually don't expire.
-- The room ids that you want to sent the notification to.
+- The Room ids that you want to send the notifications to.
- Access to the terminal where Netdata Agent is running
@@ -40,7 +40,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -50,14 +50,14 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
| SEND_MATRIX | Set `SEND_MATRIX` to YES | YES | yes |
| MATRIX_HOMESERVER | set `MATRIX_HOMESERVER` to the URL of the Matrix homeserver. | | yes |
| MATRIX_ACCESSTOKEN | Set `MATRIX_ACCESSTOKEN` to the access token from your Matrix account. | | yes |
-| DEFAULT_RECIPIENT_MATRIX | Set `DEFAULT_RECIPIENT_MATRIX` to the rooms you want the alert notifications to be sent to. The format is `!roomid:homeservername`. | | yes |
+| DEFAULT_RECIPIENT_MATRIX | Set `DEFAULT_RECIPIENT_MATRIX` to the Rooms you want the alert notifications to be sent to. The format is `!roomid:homeservername`. | | yes |
##### MATRIX_ACCESSTOKEN
@@ -69,13 +69,13 @@ curl -XPOST -d '{"type":"m.login.password", "user":"example", "password":"wordpa
##### DEFAULT_RECIPIENT_MATRIX
-The room ids are unique identifiers and can be obtained from the room settings in a Matrix client (e.g. Riot).
+The Room ids are unique identifiers and can be obtained from the Room settings in a Matrix client (e.g. Riot).
-You can define multiple rooms like this: `!roomid1:homeservername` `!roomid2:homeservername`.
+You can define multiple Rooms like this: `!roomid1:homeservername` `!roomid2:homeservername`.
All roles will default to this variable if left unconfigured.
-You can have different rooms per role, by editing `DEFAULT_RECIPIENT_MATRIX` with the `!roomid:homeservername` you want, in the following entries at the bottom of the same file:
+You can have different Rooms per role, by editing `DEFAULT_RECIPIENT_MATRIX` with the `!roomid:homeservername` you want, in the following entries at the bottom of the same file:
```conf
role_recipients_matrix[sysadmin]="!roomid1:homeservername"
diff --git a/src/health/notifications/matrix/metadata.yaml b/src/health/notifications/matrix/metadata.yaml
index 17135aa3a..db7f92eb1 100644
--- a/src/health/notifications/matrix/metadata.yaml
+++ b/src/health/notifications/matrix/metadata.yaml
@@ -20,7 +20,7 @@
description: |
- The url of the homeserver (`https://homeserver:port`).
- Credentials for connecting to the homeserver, in the form of a valid access token for your account (or for a dedicated notification account). These tokens usually don't expire.
- - The room ids that you want to sent the notification to.
+ - The Room ids that you want to send the notifications to.
- Access to the terminal where Netdata Agent is running
configuration:
file:
@@ -50,16 +50,16 @@
```
- name: 'DEFAULT_RECIPIENT_MATRIX'
default_value: ''
- description: "Set `DEFAULT_RECIPIENT_MATRIX` to the rooms you want the alert notifications to be sent to. The format is `!roomid:homeservername`."
+ description: "Set `DEFAULT_RECIPIENT_MATRIX` to the Rooms you want the alert notifications to be sent to. The format is `!roomid:homeservername`."
required: true
detailed_description: |
- The room ids are unique identifiers and can be obtained from the room settings in a Matrix client (e.g. Riot).
+ The Room ids are unique identifiers and can be obtained from the Room settings in a Matrix client (e.g. Riot).
- You can define multiple rooms like this: `!roomid1:homeservername` `!roomid2:homeservername`.
+ You can define multiple Rooms like this: `!roomid1:homeservername` `!roomid2:homeservername`.
All roles will default to this variable if left unconfigured.
- You can have different rooms per role, by editing `DEFAULT_RECIPIENT_MATRIX` with the `!roomid:homeservername` you want, in the following entries at the bottom of the same file:
+ You can have different Rooms per role, by editing `DEFAULT_RECIPIENT_MATRIX` with the `!roomid:homeservername` you want, in the following entries at the bottom of the same file:
```conf
role_recipients_matrix[sysadmin]="!roomid1:homeservername"
diff --git a/src/health/notifications/messagebird/README.md b/src/health/notifications/messagebird/README.md
index a20654147..4b668fce3 100644
--- a/src/health/notifications/messagebird/README.md
+++ b/src/health/notifications/messagebird/README.md
@@ -38,7 +38,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -48,7 +48,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/msteams/README.md b/src/health/notifications/msteams/README.md
index 4d6466afc..e24730777 100644
--- a/src/health/notifications/msteams/README.md
+++ b/src/health/notifications/msteams/README.md
@@ -39,7 +39,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -49,7 +49,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/ntfy/README.md b/src/health/notifications/ntfy/README.md
index 6cfdee4c9..a03e30304 100644
--- a/src/health/notifications/ntfy/README.md
+++ b/src/health/notifications/ntfy/README.md
@@ -40,7 +40,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -50,7 +50,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/opsgenie/README.md b/src/health/notifications/opsgenie/README.md
index b82f36a4f..fa5859d7d 100644
--- a/src/health/notifications/opsgenie/README.md
+++ b/src/health/notifications/opsgenie/README.md
@@ -39,7 +39,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -49,7 +49,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/pagerduty/README.md b/src/health/notifications/pagerduty/README.md
index 730f73168..ae45e5385 100644
--- a/src/health/notifications/pagerduty/README.md
+++ b/src/health/notifications/pagerduty/README.md
@@ -41,7 +41,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -51,7 +51,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/prowl/README.md b/src/health/notifications/prowl/README.md
index bc9b2b350..0d206cee0 100644
--- a/src/health/notifications/prowl/README.md
+++ b/src/health/notifications/prowl/README.md
@@ -44,7 +44,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -54,7 +54,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/pushbullet/README.md b/src/health/notifications/pushbullet/README.md
index 70e30ea3c..1b30f4c97 100644
--- a/src/health/notifications/pushbullet/README.md
+++ b/src/health/notifications/pushbullet/README.md
@@ -38,7 +38,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -48,7 +48,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/pushover/README.md b/src/health/notifications/pushover/README.md
index 6fe59c9ac..9d30dfa97 100644
--- a/src/health/notifications/pushover/README.md
+++ b/src/health/notifications/pushover/README.md
@@ -42,7 +42,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -52,7 +52,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/rocketchat/README.md b/src/health/notifications/rocketchat/README.md
index 4ff51a917..b9b0d5687 100644
--- a/src/health/notifications/rocketchat/README.md
+++ b/src/health/notifications/rocketchat/README.md
@@ -39,7 +39,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -49,7 +49,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/slack/README.md b/src/health/notifications/slack/README.md
index 3b26f01b4..35cb75a18 100644
--- a/src/health/notifications/slack/README.md
+++ b/src/health/notifications/slack/README.md
@@ -39,7 +39,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -49,7 +49,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/smstools3/README.md b/src/health/notifications/smstools3/README.md
index 0931a338c..dafc0b7f4 100644
--- a/src/health/notifications/smstools3/README.md
+++ b/src/health/notifications/smstools3/README.md
@@ -43,7 +43,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -53,7 +53,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/syslog/README.md b/src/health/notifications/syslog/README.md
index 7aac4dc83..72534b1c8 100644
--- a/src/health/notifications/syslog/README.md
+++ b/src/health/notifications/syslog/README.md
@@ -38,7 +38,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -48,7 +48,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/telegram/README.md b/src/health/notifications/telegram/README.md
index 2ceef6963..e263d0bb5 100644
--- a/src/health/notifications/telegram/README.md
+++ b/src/health/notifications/telegram/README.md
@@ -39,7 +39,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -49,7 +49,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/notifications/twilio/README.md b/src/health/notifications/twilio/README.md
index 2037c1f3b..cd9b17e7f 100644
--- a/src/health/notifications/twilio/README.md
+++ b/src/health/notifications/twilio/README.md
@@ -38,7 +38,7 @@ The configuration file name for this integration is `health_alarm_notify.conf`.
You can edit the configuration file using the `edit-config` script from the
-Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
@@ -48,7 +48,7 @@ sudo ./edit-config health_alarm_notify.conf
The following options can be defined for this notification
-<details><summary>Config Options</summary>
+<details open><summary>Config Options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
diff --git a/src/health/rrdcalc.c b/src/health/rrdcalc.c
index 445e61b80..bce709bf4 100644
--- a/src/health/rrdcalc.c
+++ b/src/health/rrdcalc.c
@@ -60,13 +60,13 @@ inline const char *rrdcalc_status2string(RRDCALC_STATUS status) {
}
}
-uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id) {
+uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, nd_uuid_t *config_hash_id) {
rw_spinlock_read_lock(&host->health_log.spinlock);
// re-use old IDs, by looking them up in the alarm log
ALARM_ENTRY *ae = NULL;
for(ae = host->health_log.alarms; ae ;ae = ae->next) {
- if(unlikely(name == ae->name && chart == ae->chart && !uuid_memcmp(&ae->config_hash_id, config_hash_id))) {
+ if(unlikely(name == ae->name && chart == ae->chart && uuid_eq(ae->config_hash_id, *config_hash_id))) {
if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
break;
}
@@ -241,6 +241,7 @@ static void rrdcalc_link_to_rrdset(RRDCALC *rc) {
0,
rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
+ health_log_alert(host, ae);
health_alarm_log_add_entry(host, ae);
rrdset_flag_set(st, RRDSET_FLAG_HAS_RRDCALC_LINKED);
@@ -273,6 +274,7 @@ static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) {
0,
0);
+ health_log_alert(host, ae);
health_alarm_log_add_entry(host, ae);
}
@@ -463,6 +465,17 @@ void rrdcalc_delete_all(RRDHOST *host) {
dictionary_flush(host->rrdcalc_root_index);
}
+void rrdcalc_child_disconnected(RRDHOST *host) {
+ rrdcalc_delete_all(host);
+
+ rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
+ RRDSET *st;
+ rrdset_foreach_read(st, host) {
+ rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
+ }
+ rrdset_foreach_done(st);
+}
+
void rrd_alert_match_cleanup(struct rrd_alert_match *am) {
if(am->is_template)
string_freez(am->on.context);
diff --git a/src/health/rrdcalc.h b/src/health/rrdcalc.h
index 5c998b310..3a7951a73 100644
--- a/src/health/rrdcalc.h
+++ b/src/health/rrdcalc.h
@@ -121,7 +121,7 @@ RRDCALC *rrdcalc_acquired_to_rrdcalc(const RRDCALC_ACQUIRED *rca);
const char *rrdcalc_status2string(RRDCALC_STATUS status);
-uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id);
+uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, nd_uuid_t *config_hash_id);
static inline int rrdcalc_isrepeating(RRDCALC *rc) {
if (unlikely(rc->config.warn_repeat_every > 0 || rc->config.crit_repeat_every > 0)) {
@@ -143,4 +143,6 @@ void rrdcalc_unlink_and_delete(RRDHOST *host, RRDCALC *rc, bool having_ll_wrlock
#define RRDCALC_VAR_LABEL "${label:"
#define RRDCALC_VAR_LABEL_LEN (sizeof(RRDCALC_VAR_LABEL)-1)
+void rrdcalc_child_disconnected(RRDHOST *host);
+
#endif //NETDATA_RRDCALC_H
diff --git a/src/health/rrdvar.c b/src/health/rrdvar.c
index aff318c12..4e28e62a3 100644
--- a/src/health/rrdvar.c
+++ b/src/health/rrdvar.c
@@ -12,7 +12,7 @@ typedef struct rrdvar {
inline int rrdvar_fix_name(char *variable) {
int fixed = 0;
while(*variable) {
- if (!isalnum(*variable) && *variable != '.' && *variable != '_') {
+ if (!isalnum((uint8_t)*variable) && *variable != '.' && *variable != '_') {
*variable++ = '_';
fixed++;
}
diff --git a/src/health/schema.d/health%3Aalert%3Aprototype.json b/src/health/schema.d/health%3Aalert%3Aprototype.json
new file mode 100644
index 000000000..309d052de
--- /dev/null
+++ b/src/health/schema.d/health%3Aalert%3Aprototype.json
@@ -0,0 +1,687 @@
+{
+ "jsonSchema": {
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "definitions": {
+ "matchInstanceLabels": {
+ "type": "string",
+ "default": "*",
+ "title": "Only for instances with these labels"
+ },
+ "matchHostLabels": {
+ "type": "string",
+ "default": "*",
+ "title": "Only for nodes with these host labels"
+ },
+ "matchInstance": {
+ "type": "object",
+ "title": "Apply this rule to a single instance",
+ "description": "This is a single alert rule that will be applied to the specific instance on all nodes hosted on this Netdata.",
+ "properties": {
+ "on": {
+ "type": "string",
+ "default": "",
+ "title": "The instance this rule should be applied to.",
+ "description": "You can find the instance names on all charts at the instances drop down menu. Do not include the host name in this field."
+ },
+ "host_labels": { "$ref": "#/definitions/matchHostLabels" },
+ "instance_labels": { "$ref": "#/definitions/matchInstanceLabels" }
+ },
+ "required": [
+ "on",
+ "host_labels",
+ "instance_labels"
+ ]
+ },
+ "matchTemplate": {
+ "type": "object",
+ "title": "Apply this rule to all instances of a context",
+ "description": "This rule defines a template, that will apply this alert to all instances (e.g. disks, network interfaces, nginx servers, etc) on all nodes hosted on this Netdata.",
+ "properties": {
+ "on": {
+ "type": "string",
+ "default": "",
+ "title": "The context of the instances this rule should be applied to.",
+ "description": "The context is the code-name of each chart on the dashboard, that appears at the chart title bar, between the chart title and its unit of measurement, like: system.cpu, disk.io, etc."
+ },
+ "host_labels": { "$ref": "#/definitions/matchHostLabels" },
+ "instance_labels": { "$ref": "#/definitions/matchInstanceLabels" }
+ },
+ "required": [
+ "on",
+ "host_labels",
+ "instance_labels"
+ ]
+ },
+ "configSummary": {
+ "type": "string",
+ "title": "Short description of the alert",
+ "description": "This field is used in notifications as a short description of the alert. Variables, like ${label:key}, are replaced with the value of the instance label called 'key'."
+ },
+ "configInfo": {
+ "type": "string",
+ "title": "Long description of the alert",
+ "description": "This field is used to provide enough information about the type and nature of the alert. Variables, like ${label:key}, are replaced with the value of instance label called 'key'."
+ },
+ "configType": {
+ "type": "string",
+ "title": "Alert Type",
+ "description": "Use categories like: 'System', 'Containers', 'Web Servers', 'Message Brokers', etc."
+ },
+ "configComponent": {
+ "type": "string",
+ "title": "Alert Component",
+ "description": "Component is a sub-type of Alert Type. Examples: 'CPU', 'Memory', 'Network', 'Disk', 'Hardware', 'nginx', 'redis', 'postgresql', etc."
+ },
+ "configClassification": {
+ "type": "string",
+ "title": "Classification",
+ "description": "Use 'Workload', 'Utilization', 'Latency', 'Availability', 'Errors', etc."
+ },
+ "configValue": {
+ "type": "object",
+ "title": "",
+ "description": "Each alert has a value. This section defines how this value is calculated.",
+ "properties": {
+ "database_lookup": {
+ "type": "object",
+ "title": "Database Query to Get Value",
+ "description": "The database query to be executed to calculate the value of the alert. When set, the query is executed before any other calculations. The result of the query will be available as $this in further calculations.",
+ "properties": {
+ "data_source": {
+ "type": "string",
+ "oneOf": [
+ { "const": "samples", "title": "Samples", "description": "Use the time-series values for each dimension" },
+ { "const": "percentages", "title": "Percentages", "description": "Use the percentage of each dimension vs the sum of all dimensions" },
+ { "const": "anomalies", "title": "Anomalies", "description": "Use the anomaly rate of each dimension" }
+ ],
+ "default": "samples",
+ "title": " ",
+ "description": ""
+ },
+ "time_group": {
+ "type": "string",
+ "oneOf": [
+ { "const": "average", "title": "Average" },
+ { "const": "median", "title": "Median" },
+ { "const": "min", "title": "Minimum" },
+ { "const": "max", "title": "Maximum" },
+ { "const": "sum", "title": "Sum" },
+ { "const": "incremental_sum", "title": "Incremental Sum" },
+ { "const": "stddev", "title": "Standard Deviation" },
+ { "const": "cv", "title": "Coefficient of Variation" },
+ { "const": "trimmed-mean", "title": "Trimmed Mean" },
+ { "const": "trimmed-median", "title": "Trimmed Median" },
+ { "const": "percentile", "title": "Percentile" },
+ { "const": "ses", "title": "Simple Exponential Smoothing" },
+ { "const": "des", "title": "Double Exponential Smoothing" },
+ { "const": "countif", "title": "Count If" }
+ ],
+ "default": "average",
+ "title": "Time Aggregation",
+ "description": ""
+ },
+ "after": {
+ "type": "integer",
+ "default": -600,
+ "title": "From",
+ "description": "Relative to 'To'"
+ },
+ "before": {
+ "type": "integer",
+ "default": 0,
+ "title": "To",
+ "description": "Ending timestamp"
+ },
+ "dims_group": {
+ "type": "string",
+ "oneOf": [
+ { "const": "average", "title": "Average", "description": "The average of all dimensions" },
+ { "const": "min", "title": "Minimum", "description": "The minimum of all dimensions" },
+ { "const": "max", "title": "Maximum", "description": "The maximum of all dimensions" },
+ { "const": "sum", "title": "Sum", "description": "The sum of all dimensions" },
+ { "const": "min2max", "title": "Min-to-Max", "description": "The delta between the minimum and the maximum of the dimensions" }
+ ],
+ "default": "sum",
+ "title": "Dims Aggregation",
+ "description": "on dimensions"
+ },
+ "dimensions": {
+ "type": "string",
+ "title": "Dimensions",
+ "description": "Simple pattern",
+ "default": "*"
+ },
+ "options": {
+ "type": "array",
+ "title": "Time-Series Query options",
+ "description": "Options affecting the way the value is calculated",
+ "uniqueItems": true,
+ "items": {
+ "oneOf": [
+ { "const": "unaligned", "title": "Do not shift the time-frame for visual presentation" },
+ { "const": "abs", "title": "Make all values positive before using them" },
+ { "const": "null2zero", "title": "Treat gaps in the time-series as a zero value" },
+ { "const": "match_ids", "title": "Match only dimension IDs, not Names" },
+ { "const": "match_names", "title": "Match only dimension Names, not IDs" }
+ ]
+ },
+ "default": [ "unaligned" ]
+ }
+ },
+ "allOf": [
+ {
+ "if": {
+ "properties": {
+ "time_group": {
+ "enum": ["trimmed-mean"]
+ }
+ }
+ },
+ "then": {
+ "properties": {
+ "time_group_value": {
+ "type": "integer",
+ "default": 1,
+ "title": "Trim %",
+ "description": ""
+ }
+ },
+ "required": ["time_group_value"]
+ }
+ },
+ {
+ "if": {
+ "properties": {
+ "time_group": {
+ "enum": ["trimmed-median"]
+ }
+ }
+ },
+ "then": {
+ "properties": {
+ "time_group_value": {
+ "type": "integer",
+ "default": 1,
+ "title": "Trim %",
+ "description": ""
+ }
+ },
+ "required": ["time_group_value"]
+ }
+ },
+ {
+ "if": {
+ "properties": {
+ "time_group": {
+ "enum": ["percentile"]
+ }
+ }
+ },
+ "then": {
+ "properties": {
+ "time_group_value": {
+ "type": "integer",
+ "default": 95,
+ "title": "Percentage",
+ "description": ""
+ }
+ },
+ "required": ["time_group_value"]
+ }
+ },
+ {
+ "if": {
+ "properties": {
+ "time_group": {
+ "const": "countif"
+ }
+ }
+ },
+ "then": {
+ "properties": {
+ "time_group_condition": {
+ "type": "string",
+ "oneOf": [
+ { "const": "!=", "title": "!=" },
+ { "const": "=", "title": "==" },
+ { "const": ">=", "title": ">=" },
+ { "const": ">", "title": ">" },
+ { "const": "<=", "title": "<=" },
+ { "const": "<", "title": "<" }
+ ],
+ "default": "=",
+ "title": "Condition",
+ "description": ""
+ },
+ "time_group_value": {
+ "type": "number",
+ "default": 1,
+ "title": "Value to match",
+ "description": ""
+ }
+ },
+ "required": ["time_group_condition", "time_group_value"]
+ }
+ }
+ ]
+ },
+ "calculation": {
+ "type": "string",
+ "title": "Calculation",
+ "description": "An expression to transform the value"
+ },
+ "units": {
+ "type": "string",
+ "title": "Unit",
+ "description": "of measurement"
+ },
+ "update_every": {
+ "type": "integer",
+ "default": 10,
+ "minimum": 1,
+ "title": "Frequency",
+ "description": "of evaluation"
+ }
+ }
+ },
+ "configConditions": {
+ "type": "object",
+ "title": "",
+ "properties": {
+ "warning_condition": {
+ "type": "string",
+ "title": "Warning Expression",
+ "description": "The alert value is available as '$this'. If this expression evaluates to a non-zero value, the alert is considered to be in warning level."
+ },
+ "critical_condition": {
+ "type": "string",
+ "title": "Critical Expression",
+ "description": "The alert value is available as '$this'. If this expression evaluates to a non-zero value, the alert is considered to be in critical level."
+ }
+ }
+ },
+ "configAction": {
+ "type": "object",
+ "title": "",
+ "description": "The action the alert should take when it transitions states",
+ "properties": {
+ "execute": {
+ "type": "string",
+ "title": "Command to execute when the alert transitions states"
+ },
+ "recipient": {
+ "type": "string",
+ "title": "Recipient(s)"
+ },
+ "options": {
+ "type": "array",
+ "title": "Action Options",
+ "uniqueItems": true,
+ "items": {
+ "oneOf": [
+ { "const": "no-clear-notification", "title": "Do not perform any action when the alert is cleared"}
+ ]
+ },
+ "default": []
+ },
+ "delay": {
+ "type": "object",
+ "title": "Delay the action (notification)",
+ "description": "Rules to postpone the action, to avoid multiple notifications on flapping alerts.",
+ "properties": {
+ "up": {
+ "type": "integer",
+ "title": "Delay when raising"
+ },
+ "down": {
+ "type": "integer",
+ "title": "Delay when going Down"
+ },
+ "multiplier": {
+ "type": "number",
+ "title": "Back-Off"
+ },
+ "max": {
+ "type": "integer",
+ "title": "Max"
+ }
+ }
+ },
+ "repeat": {
+ "type": "object",
+ "title": "Auto-Repeat Action",
+ "description": "Repeat the action while the alert is raised.",
+ "properties": {
+ "enabled": {
+ "type": "boolean"
+ },
+ "warning": {
+ "type": "integer",
+ "title": "Repeat on Warning"
+ },
+ "critical": {
+ "type": "integer",
+ "title": "Repeat on Critical"
+ }
+ }
+ }
+ }
+ },
+ "configInstance": {
+ "type": "object",
+ "title": "Alert Configuration",
+ "description": "The properties that control the value the alert will get, the conditions it will trigger, the back-off for notifications, the auto-repeating of notifications, etc.",
+ "properties": {
+ "match": { "$ref": "#/definitions/matchInstance" },
+ "summary": { "$ref": "#/definitions/configSummary" },
+ "info": { "$ref": "#/definitions/configInfo" },
+ "type": { "$ref": "#/definitions/configType" },
+ "component": { "$ref": "#/definitions/configComponent" },
+ "classification": { "$ref": "#/definitions/configClassification" },
+ "value": { "$ref": "#/definitions/configValue" },
+ "conditions": { "$ref": "#/definitions/configConditions" },
+ "action": { "$ref": "#/definitions/configAction" }
+ },
+ "required": []
+ },
+ "configTemplate": {
+ "type": "object",
+ "title": "Alert Configuration",
+ "description": "The properties that control the value the alert will get, the conditions it will trigger, the back-off for notifications, the auto-repeating of notifications, etc.",
+ "properties": {
+ "match": { "$ref": "#/definitions/matchTemplate" },
+ "summary": { "$ref": "#/definitions/configSummary" },
+ "info": { "$ref": "#/definitions/configInfo" },
+ "type": { "$ref": "#/definitions/configType" },
+ "component": { "$ref": "#/definitions/configComponent" },
+ "classification": { "$ref": "#/definitions/configClassification" },
+ "value": { "$ref": "#/definitions/configValue" },
+ "conditions": { "$ref": "#/definitions/configConditions" },
+ "action": { "$ref": "#/definitions/configAction" }
+ },
+ "required": []
+ }
+ },
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "format_version": {
+ "type": "integer",
+ "default": 1
+ },
+ "rules": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "enabled": {
+ "type": "boolean",
+ "default": true,
+ "title": "Enabled",
+ "description": ""
+ },
+ "type": {
+ "type": "string",
+ "oneOf": [
+ { "const": "instance" , "title": "A specific Instance" },
+ { "const": "template" , "title": "Each of the Instances" }
+ ],
+ "default": "template",
+ "title": "Apply this rule to:",
+ "description": ""
+ }
+ },
+ "required": [ "type", "enabled" ],
+ "if": {
+ "properties": {
+ "type": { "const": "instance" }
+ }
+ },
+ "then": {
+ "properties": {
+ "config": { "$ref": "#/definitions/configInstance" }
+ }
+ },
+ "else": {
+ "properties": {
+ "config": { "$ref": "#/definitions/configTemplate" }
+ }
+ }
+ }
+ }
+ },
+ "required": [
+ "rules"
+ ]
+ },
+ "uiSchema": {
+ "uiOptions": {
+ "fullPage": true
+ },
+ "format_version": {
+ "ui:widget": "hidden"
+ },
+ "name": {
+ "ui:widget": "hidden"
+ },
+ "rules": {
+ "ui:openEmptyItem": true,
+ "items": {
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6",
+ "enabled": {
+ "ui:classNames": "dyncfg-grid-col-span-1-2",
+ "ui:widget": "checkbox"
+ },
+ "type": {
+ "ui:classNames": "dyncfg-grid-col-span-5-2",
+ "ui:help": "Rules can be configured to match a specific instance (like a specific disk), or match all the instances (like all the disks). All rules are always checked against all nodes streamed to this Netdata, so the matching rules include patterns to match both instances and nodes.",
+ "ui:widget": "radio",
+ "ui:options": {
+ "flavour": "buttonGroup"
+ }
+ },
+ "config": {
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
+ "ui:flavour": "tabs",
+ "ui:options": {
+ "tabs": [
+ {
+ "title": "Match",
+ "fields": [
+ "match"
+ ]
+ },
+ {
+ "title": "Value",
+ "fields": [
+ "value"
+ ]
+ },
+ {
+ "title": "Triggers",
+ "fields": [
+ "conditions"
+ ]
+ },
+ {
+ "title": "Action",
+ "fields": [
+ "action"
+ ]
+ },
+ {
+ "title": "Description",
+ "fields": [
+ "summary",
+ "info",
+ "type",
+ "component",
+ "classification"
+ ]
+ }
+ ]
+ },
+ "match": {
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
+ "on": {
+ "ui:classNames": "dyncfg-grid-col-span-1-6"
+ },
+ "host_labels": {
+ "ui:help": "A simple pattern to match the node labels of the nodes this rule is to be applied to. A space separated list of label=value pairs is accepted. Asterisks can be placed anywhere, including the label key. The label keys and their values are available at the labels filter of the charts on the dashboard.",
+ "ui:classNames": "dyncfg-grid-col-span-1-3"
+ },
+ "instance_labels": {
+ "ui:classNames": "dyncfg-grid-col-span-4-3",
+ "ui:help": "A simple pattern to match the instance labels of the instances this rule is to be applied to. A space separated list of label=value pairs is accepted. Asterisks can be placed anywhere, including the label key. The label keys and their values are available at the labels filter of the charts on the dashboard."
+ }
+ },
+ "summary": {
+ "ui:classNames": "dyncfg-grid-col-span-1-3"
+ },
+ "info": {
+ "ui:classNames": "dyncfg-grid-col-span-4-3"
+ },
+ "type": {
+ "ui:classNames": "dyncfg-grid-col-span-1-2"
+ },
+ "component": {
+ "ui:classNames": "dyncfg-grid-col-span-3-2"
+ },
+ "classification": {
+ "ui:classNames": "dyncfg-grid-col-span-5-2"
+ },
+ "value": {
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
+ "database_lookup": {
+ "ui:order": ["data_source", "time_group", "time_group_condition", "time_group_value", "after", "before", "dims_group", "dimensions", "options"],
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
+ "ui:collapsible": true,
+ "ui:initiallyExpanded": true,
+ "data_source": {
+ "ui:widget": "radio",
+ "ui:options": {
+ "flavour": "buttonGroup"
+ },
+ "ui:classNames": "dyncfg-grid-col-span-1-6"
+ },
+ "time_group": {
+ "ui:help": "When querying time-series data we need to come up with a single value. This function is used to aggregate all the values of the time-series data to a single value.",
+ "ui:classNames": "dyncfg-grid-col-span-1-2"
+ },
+ "time_group_condition": {
+ "ui:classNames": "dyncfg-grid-col-span-3-1"
+ },
+ "time_group_value": {
+ "ui:classNames": "dyncfg-grid-col-span-4-1"
+ },
+ "after": {
+ "ui:help": "The oldest timestamp of the time-series data to be included in the query. Negative values define a duration in seconds in the past of 'To' (so, -60 means a minute ago from 'To').",
+ "ui:classNames": "dyncfg-grid-col-span-1-1"
+ },
+ "before": {
+ "ui:help": "The newest timestamp of the time-series data to be included in the query. Negative values define a duration in seconds in the past (so, -60 means a minute ago). Zero means now.",
+ "ui:classNames": "dyncfg-grid-col-span-2-1"
+ },
+ "dims_group": {
+ "ui:help": "After each dimension has a single computed value, use this algorithm to derive the final value.",
+ "ui:classNames": "dyncfg-grid-col-span-3-2"
+ },
+ "dimensions": {
+ "ui:help": "A simple pattern to match the dimensions that should be included in the query",
+ "ui:classNames": "dyncfg-grid-col-span-5-2"
+ },
+ "options": {
+ "ui:classNames": "dyncfg-grid-col-span-1-6"
+ }
+ },
+ "calculation": {
+ "ui:help": "The database value is available as '$this'. This expression can utilize variables to transform the value of the alert.",
+ "ui:classNames": "dyncfg-grid-col-span-1-4",
+ "ui:placeholder": "$this * 1"
+ },
+ "units": {
+ "ui:help": "The unit of measurement the alert value is expressed with. If unset, the units of the instance the alert is attached to will be used.",
+ "ui:classNames": "dyncfg-grid-col-span-5-1"
+ },
+ "update_every": {
+ "ui:help": "The frequency this alarm is to be evaluated, in seconds.",
+ "ui:classNames": "dyncfg-grid-col-span-6-1"
+ }
+ },
+ "conditions": {
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
+ "warning_condition": {
+ "ui:classNames": "dyncfg-grid-col-span-1-6"
+ },
+ "critical_condition": {
+ "ui:classNames": "dyncfg-grid-col-span-1-6"
+ }
+ },
+ "action": {
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
+ "execute": {
+ "ui:classNames": "dyncfg-grid-col-span-1-3",
+ "ui:help": "Leave this empty to get the default alert notification"
+ },
+ "recipient": {
+ "ui:classNames": "dyncfg-grid-col-span-4-1",
+ "ui:help": "A space separated list of the recipients of the alert notifications. The special recipient 'silent' prevents this alert from taking any action (i.e. sending notifications)."
+ },
+ "options": {
+ "ui:classNames": "dyncfg-grid-col-span-5-2",
+ "ui:help": "Options related to the actions this alert will take."
+ },
+ "delay": {
+ "ui:collapsible": true,
+ "ui:initiallyExpanded": false,
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
+ "up": {
+ "ui:classNames": "dyncfg-grid-col-span-1-2",
+ "ui:help": "Delay the action (notification) that many seconds, when the alert is rising."
+ },
+ "down": {
+ "ui:classNames": "dyncfg-grid-col-span-3-2",
+ "ui:help": "Delay the action (notification) that many seconds, when the alert is recovering."
+ },
+ "multiplier": {
+ "ui:classNames": "dyncfg-grid-col-span-5-1",
+ "ui:help": "Multiply the delay by this number, every time the alert transitions to a new state, while the action (notification) is being delayed."
+ },
+ "max": {
+ "ui:classNames": "dyncfg-grid-col-span-6-1",
+ "ui:help": "The maximum acceptable delay in seconds, for taking the action (notification)."
+ }
+ },
+ "repeat": {
+ "ui:collapsible": true,
+ "ui:initiallyExpanded": false,
+ "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
+ "enabled": {
+ "ui:classNames": "dyncfg-grid-col-span-1-2"
+ },
+ "warning": {
+ "ui:classNames": "dyncfg-grid-col-span-3-2",
+ "ui:help": "The number of seconds to repeat the action while the alert is in warning state"
+ },
+ "critical": {
+ "ui:classNames": "dyncfg-grid-col-span-5-2",
+ "ui:help": "The number of seconds to repeat the action while the alert is in critical state"
+ }
+ }
+ },
+ "hash": {
+ "ui:widget": "hidden"
+ },
+ "source_type": {
+ "ui:widget": "hidden"
+ },
+ "source": {
+ "ui:widget": "hidden"
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/src/health/schema.d/health:alert:prototype.json b/src/health/schema.d/health:alert:prototype.json
deleted file mode 100644
index 115b3c2f9..000000000
--- a/src/health/schema.d/health:alert:prototype.json
+++ /dev/null
@@ -1,675 +0,0 @@
-{
- "jsonSchema": {
- "$schema": "http://json-schema.org/draft-07/schema#",
- "definitions": {
- "matchInstanceLabels": {
- "type": "string",
- "default": "*",
- "title": "Only for instances with these labels"
- },
- "matchHostLabels": {
- "type": "string",
- "default": "*",
- "title": "Only for nodes with these host labels"
- },
- "matchInstance": {
- "type": "object",
- "title": "Apply this rule to a single instance",
- "description": "This is a single alert rule that will be applied to the specific instance on all nodes hosted on this Netdata.",
- "properties": {
- "on": {
- "type": "string",
- "default": "",
- "title": "The instance this rule should be applied to.",
- "description": "You can find the instance names on all charts at the instances drop down menu. Do not include the host name in this field."
- },
- "host_labels": { "$ref": "#/definitions/matchHostLabels" },
- "instance_labels": { "$ref": "#/definitions/matchInstanceLabels" }
- },
- "required": [
- "on",
- "host_labels",
- "instance_labels"
- ]
- },
- "matchTemplate": {
- "type": "object",
- "title": "Apply this rule to all instances of a context",
- "description": "This rule defines a template, that will apply this alert to all instances (e.g. disks, network interfaces, nginx servers, etc) on all nodes hosted on this Netdata.",
- "properties": {
- "on": {
- "type": "string",
- "default": "",
- "title": "The context of the instances this rule should be applied to.",
- "description": "The context is the code-name of each chart on the dashboard, that appears at the chart title bar, between the chart title and its unit of measurement, like: system.cpu, disk.io, etc."
- },
- "host_labels": { "$ref": "#/definitions/matchHostLabels" },
- "instance_labels": { "$ref": "#/definitions/matchInstanceLabels" }
- },
- "required": [
- "on",
- "host_labels",
- "instance_labels"
- ]
- },
- "configSummary": {
- "type": "string",
- "title": "Short description of the alert",
- "description": "This field is used in notification as a short description of the alert. Variables, like ${label:key}, are replaced with the value of instance label called 'key'."
- },
- "configInfo": {
- "type": "string",
- "title": "Long description of the alert",
- "description": "This field is used to provide enough information about the type and nature of the alert. Variables, like ${label:key}, are replaced with the value of instance label called 'key'."
- },
- "configType": {
- "type": "string",
- "title": "Alert Type",
- "description": "Use categories like: 'System', 'Containers', 'Web Servers', 'Message Brokers', etc."
- },
- "configComponent": {
- "type": "string",
- "title": "Alert Component",
- "description": "Component is a sub-type of Alert Type. Examples: 'CPU', 'Memory', 'Network', 'Disk', 'Hardware', 'nginx', 'redis', 'postgresql', etc."
- },
- "configClassification": {
- "type": "string",
- "title": "Classification",
- "description": "Use 'Workload', 'Utilization', 'Latency', 'Availability', 'Errors', etc."
- },
- "configValue": {
- "type": "object",
- "title": "",
- "description": "Each alert has a value. This section defines how this value is calculated.",
- "properties": {
- "database_lookup": {
- "type": "object",
- "title": "Database Query to Get Value",
- "description": "The database query to be executed to calculate the value of the alert. When set, the query is executed before any other calculations. The result of the query will be available as $this in further calculations.",
- "properties": {
- "data_source": {
- "type": "string",
- "oneOf": [
- { "const": "samples", "title": "Samples", "description": "Use the time-series values for each dimension" },
- { "const": "percentages", "title": "Percentages", "description": "Use the percentage of each dimension vs the sum of all dimensions" },
- { "const": "anomalies", "title": "Anomalies", "description": "Use the anomaly rate of each dimension" }
- ],
- "default": "samples",
- "title": " ",
- "description": ""
- },
- "time_group": {
- "type": "string",
- "oneOf": [
- { "const": "average", "title": "Average" },
- { "const": "median", "title": "Median" },
- { "const": "min", "title": "Minimum" },
- { "const": "max", "title": "Maximum" },
- { "const": "sum", "title": "Sum" },
- { "const": "incremental_sum", "title": "Incremental Sum" },
- { "const": "stddev", "title": "Standard Deviation" },
- { "const": "cv", "title": "Coefficient of Variation" },
- { "const": "trimmed-mean", "title": "Trimmed Mean" },
- { "const": "trimmed-median", "title": "Trimmed Median" },
- { "const": "percentile", "title": "Percentile" },
- { "const": "ses", "title": "Simple Exponential Smoothing" },
- { "const": "des", "title": "Double Exponential Smoothing" },
- { "const": "countif", "title": "Count If" }
- ],
- "default": "average",
- "title": "Time Aggregation",
- "description": ""
- },
- "after": {
- "type": "integer",
- "default": -600,
- "title": "From",
- "description": "Relative to 'To'"
- },
- "before": {
- "type": "integer",
- "default": 0,
- "title": "To",
- "description": "Ending timestamp"
- },
- "dims_group": {
- "type": "string",
- "oneOf": [
- { "const": "average", "title": "Average", "description": "The average of all dimensions" },
- { "const": "min", "title": "Minimum", "description": "The minimum of all dimensions" },
- { "const": "max", "title": "Maximum", "description": "The maximum of all dimensions" },
- { "const": "sum", "title": "Sum", "description": "The sum of all dimensions" },
- { "const": "min2max", "title": "Min-to-Max", "description": "The delta between the minimum of the maximum of the dimensions" }
- ],
- "default": "sum",
- "title": "Dims Aggregation",
- "description": "on dimensions"
- },
- "dimensions": {
- "type": "string",
- "title": "Dimensions",
- "description": "Simple pattern",
- "default": "*"
- },
- "options": {
- "type": "array",
- "title": "Time-Series Query options",
- "description": "Options affecting the way the value is calculated",
- "uniqueItems": true,
- "items": {
- "oneOf": [
- { "const": "unaligned", "title": "Do not shift the time-frame for visual presentation" },
- { "const": "abs", "title": "Make all values positive before using them" },
- { "const": "null2zero", "title": "Treat gaps in the time-series as a zero value" },
- { "const": "match_ids", "title": "Match only dimension IDs, not Names" },
- { "const": "match_names", "title": "Match only dimension Names, not IDs" }
- ]
- },
- "default": [ "unaligned" ]
- }
- },
- "allOf": [
- {
- "if": {
- "properties": {
- "time_group": {
- "enum": ["trimmed-mean"]
- }
- }
- },
- "then": {
- "properties": {
- "time_group_value": {
- "type": "integer",
- "default": 1,
- "title": "Trim %",
- "description": ""
- }
- },
- "required": ["time_group_value"]
- }
- },
- {
- "if": {
- "properties": {
- "time_group": {
- "enum": ["trimmed-median"]
- }
- }
- },
- "then": {
- "properties": {
- "time_group_value": {
- "type": "integer",
- "default": 1,
- "title": "Trim %",
- "description": ""
- }
- },
- "required": ["time_group_value"]
- }
- },
- {
- "if": {
- "properties": {
- "time_group": {
- "enum": ["percentile"]
- }
- }
- },
- "then": {
- "properties": {
- "time_group_value": {
- "type": "integer",
- "default": 95,
- "title": "Percentage",
- "description": ""
- }
- },
- "required": ["time_group_value"]
- }
- },
- {
- "if": {
- "properties": {
- "time_group": {
- "const": "countif"
- }
- }
- },
- "then": {
- "properties": {
- "time_group_condition": {
- "type": "string",
- "oneOf": [
- { "const": "!=", "title": "!=" },
- { "const": "=", "title": "==" },
- { "const": ">=", "title": ">=" },
- { "const": ">", "title": ">" },
- { "const": "<=", "title": "<=" },
- { "const": "<", "title": "<" }
- ],
- "default": "equal",
- "title": "Condition",
- "description": ""
- },
- "time_group_value": {
- "type": "number",
- "default": 1,
- "title": "Value to match",
- "description": ""
- }
- },
- "required": ["time_group_condition", "time_group_value"]
- }
- }
- ]
- },
- "calculation": {
- "type": "string",
- "title": "Calculation",
- "description": "An expression to transform the value"
- },
- "units": {
- "type": "string",
- "title": "Unit",
- "description": "of measurement"
- }
- }
- },
- "configConditions": {
- "type": "object",
- "title": "",
- "properties": {
- "warning_condition": {
- "type": "string",
- "title": "Warning Expression",
- "description": "The alert value is available as '$this'. If this expression evaluates to a non-zero value, the alert is considered to be in warning level."
- },
- "critical_condition": {
- "type": "string",
- "title": "Critical Expression",
- "description": "The alert value is available as '$this'. If this expression evaluates to a non-zero value, the alert is considered to be in critical level."
- }
- }
- },
- "configAction": {
- "type": "object",
- "title": "",
- "description": "The action the alert should take when it transitions states",
- "properties": {
- "execute": {
- "type": "string",
- "title": "Command to execute when the alert transitions states"
- },
- "recipient": {
- "type": "string",
- "title": "Recipient(s)"
- },
- "options": {
- "type": "array",
- "title": "Action Options",
- "uniqueItems": true,
- "items": {
- "oneOf": [
- { "const": "no-clear-notification", "title": "Do not perform any action when the alert is cleared"}
- ]
- },
- "default": []
- },
- "delay": {
- "type": "object",
- "title": "Delay the action (notification)",
- "description": "Rules to postpone the action, to avoid multiple notifications on flapping alerts.",
- "properties": {
- "up": {
- "type": "integer",
- "title": "Delay when raising"
- },
- "down": {
- "type": "integer",
- "title": "Delay when going Down"
- },
- "multiplier": {
- "type": "number",
- "title": "Back-Off"
- },
- "max": {
- "type": "integer",
- "title": "Max"
- }
- }
- },
- "repeat": {
- "type": "object",
- "title": "Auto-Repeat Action",
- "description": "Repeat the action while the alert is raised.",
- "properties": {
- "enabled": {
- "type": "boolean"
- },
- "warning": {
- "type": "integer",
- "title": "Repeat on Warning"
- },
- "critical": {
- "type": "integer",
- "title": "Repeat on Critical"
- }
- }
- }
- }
- },
- "configInstance": {
- "type": "object",
- "title": "Alert Configuration",
- "description": "The properties that control the value the alert will get, the conditions it will trigger, the back-off for notifications, the auto-repeating of notifications, etc.",
- "properties": {
- "match": { "$ref": "#/definitions/matchInstance" },
- "summary": { "$ref": "#/definitions/configSummary" },
- "info": { "$ref": "#/definitions/configInfo" },
- "type": { "$ref": "#/definitions/configType" },
- "component": { "$ref": "#/definitions/configComponent" },
- "classification": { "$ref": "#/definitions/configClassification" },
- "value": { "$ref": "#/definitions/configValue" },
- "conditions": { "$ref": "#/definitions/configConditions" },
- "action": { "$ref": "#/definitions/configAction" }
- },
- "required": []
- },
- "configTemplate": {
- "type": "object",
- "title": "Alert Configuration",
- "description": "The properties that control the value the alert will get, the conditions it will trigger, the back-off for notifications, the auto-repeating of notifications, etc.",
- "properties": {
- "match": { "$ref": "#/definitions/matchTemplate" },
- "summary": { "$ref": "#/definitions/configSummary" },
- "info": { "$ref": "#/definitions/configInfo" },
- "type": { "$ref": "#/definitions/configType" },
- "component": { "$ref": "#/definitions/configComponent" },
- "classification": { "$ref": "#/definitions/configClassification" },
- "value": { "$ref": "#/definitions/configValue" },
- "conditions": { "$ref": "#/definitions/configConditions" },
- "action": { "$ref": "#/definitions/configAction" }
- },
- "required": []
- }
- },
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "format_version": {
- "type": "integer",
- "default": 1
- },
- "rules": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "enabled": {
- "type": "boolean",
- "default": true,
- "title": "Enabled",
- "description": ""
- },
- "type": {
- "type": "string",
- "oneOf": [
- { "const": "instance" , "title": "A specific Instance" },
- { "const": "template" , "title": "Each of the Instances" }
- ],
- "default": "template",
- "title": "Apply this rule to:",
- "description": ""
- }
- },
- "required": [ "type", "enabled" ],
- "if": {
- "properties": {
- "type": { "const": "instance" }
- }
- },
- "then": {
- "properties": {
- "config": { "$ref": "#/definitions/configInstance" }
- }
- },
- "else": {
- "properties": {
- "config": { "$ref": "#/definitions/configTemplate" }
- }
- }
- }
- }
- },
- "required": [
- "rules"
- ]
- },
- "uiSchema": {
- "uiOptions": {
- "fullPage": true
- },
- "format_version": {
- "ui:widget": "hidden"
- },
- "name": {
- "ui:widget": "hidden"
- },
- "rules": {
- "items": {
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6",
- "enabled": {
- "ui:classNames": "dyncfg-grid-col-span-1-2",
- "ui:widget": "checkbox"
- },
- "type": {
- "ui:classNames": "dyncfg-grid-col-span-5-2",
- "ui:help": "Rules can be configured to match a specific instance (like a specific disk), or match all the instances (like all the disks). All rules are always checked against all nodes streamed to this Netdata, so the matching rules include patterns to match both instances and nodes.",
- "ui:widget": "radio",
- "ui:options": {
- "flavour": "buttonGroup"
- }
- },
- "config": {
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
- "ui:flavour": "tabs",
- "ui:options": {
- "tabs": [
- {
- "title": "Match",
- "fields": [
- "match"
- ]
- },
- {
- "title": "Value",
- "fields": [
- "value"
- ]
- },
- {
- "title": "Triggers",
- "fields": [
- "conditions"
- ]
- },
- {
- "title": "Action",
- "fields": [
- "action"
- ]
- },
- {
- "title": "Description",
- "fields": [
- "summary",
- "info",
- "type",
- "component",
- "classification"
- ]
- }
- ]
- },
- "match": {
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
- "on": {
- "ui:classNames": "dyncfg-grid-col-span-1-6"
- },
- "host_labels": {
- "ui:help": "A simple pattern to match the node labels of the nodes this rule is to be applied to. A space separated list of label=value pairs is accepted. Asterisks can be placed anywhere, including the label key. The label keys and their values are available at the labels filter of the charts on the dashboard.",
- "ui:classNames": "dyncfg-grid-col-span-1-3"
- },
- "instance_labels": {
- "ui:classNames": "dyncfg-grid-col-span-4-3",
- "ui:help": "A simple pattern to match the instance labels of the instances this rule is to be applied to. A space separated list of label=value pairs is accepted. Asterisks can be placed anywhere, including the label key. The label keys and their values are available at the labels filter of the charts on the dashboard."
- }
- },
- "summary": {
- "ui:classNames": "dyncfg-grid-col-span-1-3"
- },
- "info": {
- "ui:classNames": "dyncfg-grid-col-span-4-3"
- },
- "type": {
- "ui:classNames": "dyncfg-grid-col-span-1-2"
- },
- "component": {
- "ui:classNames": "dyncfg-grid-col-span-3-2"
- },
- "classification": {
- "ui:classNames": "dyncfg-grid-col-span-5-2"
- },
- "value": {
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
- "database_lookup": {
- "ui:order": ["data_source", "time_group", "time_group_condition", "time_group_value", "after", "before", "dims_group", "dimensions", "options"],
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
- "ui:collapsible": true,
- "ui:initiallyExpanded": true,
- "data_source": {
- "ui:widget": "radio",
- "ui:options": {
- "flavour": "buttonGroup"
- },
- "ui:classNames": "dyncfg-grid-col-span-1-6"
- },
- "time_group": {
- "ui:help": "When querying time-series data we need to come up with a single value. This function is used to aggregate all the values of the time-series data to a single value.",
- "ui:classNames": "dyncfg-grid-col-span-1-2"
- },
- "time_group_condition": {
- "ui:classNames": "dyncfg-grid-col-span-3-1"
- },
- "time_group_value": {
- "ui:classNames": "dyncfg-grid-col-span-4-1"
- },
- "after": {
- "ui:help": "The oldest timestamp of the time-series data to be included in the query. Negative values define a duration in seconds before 'To' (so -60 means one minute before 'To').",
- "ui:classNames": "dyncfg-grid-col-span-1-1"
- },
- "before": {
- "ui:help": "The newest timestamp of the time-series data to be included in the query. Negative values define a duration in seconds in the past (so, -60 means a minute ago). Zero means now.",
- "ui:classNames": "dyncfg-grid-col-span-2-1"
- },
- "dims_group": {
- "ui:help": "After each dimension has a single computed value, use this algorithm to derive the final value.",
- "ui:classNames": "dyncfg-grid-col-span-3-2"
- },
- "dimensions": {
- "ui:help": "A simple pattern to match the dimensions that should be included in the query",
- "ui:classNames": "dyncfg-grid-col-span-5-2"
- },
- "options": {
- "ui:classNames": "dyncfg-grid-col-span-1-6"
- }
- },
- "calculation": {
- "ui:help": "The database value is available as '$this'. This expression can utilize variables to transform the value of the alert.",
- "ui:classNames": "dyncfg-grid-col-span-1-5",
- "ui:placeholder": "$this * 1"
- },
- "units": {
- "ui:help": "The unit of measurement the alert value is expressed with. If unset, the units of the instance the alert is attached to will be used.",
- "ui:classNames": "dyncfg-grid-col-span-6-1"
- }
- },
- "conditions": {
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
- "warning_condition": {
- "ui:classNames": "dyncfg-grid-col-span-1-6"
- },
- "critical_condition": {
- "ui:classNames": "dyncfg-grid-col-span-1-6"
- }
- },
- "action": {
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
- "execute": {
- "ui:classNames": "dyncfg-grid-col-span-1-3",
- "ui:help": "Leave this empty to get the default alert notification"
- },
- "recipient": {
- "ui:classNames": "dyncfg-grid-col-span-4-1",
- "ui:help": "A space separated list of the recipients of the alert notifications. The special recipient 'silent' prevents this alert from taking any action (i.e. sending notifications)."
- },
- "options": {
- "ui:classNames": "dyncfg-grid-col-span-5-2",
- "ui:help": "Options related to the actions this alert will take."
- },
- "delay": {
- "ui:collapsible": true,
- "ui:initiallyExpanded": false,
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
- "up": {
- "ui:classNames": "dyncfg-grid-col-span-1-2",
- "ui:help": "Delay the action (notification) by this many seconds when the alert is rising."
- },
- "down": {
- "ui:classNames": "dyncfg-grid-col-span-3-2",
- "ui:help": "Delay the action (notification) by this many seconds when the alert is recovering."
- },
- "multiplier": {
- "ui:classNames": "dyncfg-grid-col-span-5-1",
- "ui:help": "Multiply the delay by this number, every time the alert transitions to a new state, while the action (notification) is being delayed."
- },
- "max": {
- "ui:classNames": "dyncfg-grid-col-span-6-1",
- "ui:help": "The maximum acceptable delay in seconds, for taking the action (notification)."
- }
- },
- "repeat": {
- "ui:collapsible": true,
- "ui:initiallyExpanded": false,
- "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6",
- "enabled": {
- "ui:classNames": "dyncfg-grid-col-span-1-2"
- },
- "warning": {
- "ui:classNames": "dyncfg-grid-col-span-3-2",
- "ui:help": "The number of seconds to repeat the action while the alert is in warning state"
- },
- "critical": {
- "ui:classNames": "dyncfg-grid-col-span-5-2",
- "ui:help": "The number of seconds to repeat the action while the alert is in critical state"
- }
- }
- },
- "hash": {
- "ui:widget": "hidden"
- },
- "source_type": {
- "ui:widget": "hidden"
- },
- "source": {
- "ui:widget": "hidden"
- }
- }
- }
- }
- }
-}